Processing Chinese Text in C

Source: Internet
Author: User

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/*
 * repunlawchar - normalise a line of GBK/GB2312-encoded text.
 *
 * Keeps Chinese double-byte characters, ASCII letters (folded to lower case)
 * and ASCII digits.  Full-width letters and digits (lead byte 0xA3) are
 * converted to their half-width ASCII form, letters again folded to lower
 * case.  Sentence terminators become a single line break:
 *   - ASCII  '?', '!', ';'
 *   - full-width '!', ';', '?'  (0xA3 0xA1, 0xA3 0xBB, 0xA3 0xBF)
 *   - the ideographic full stop  (0xA1 0xA3)
 * Everything else (ASCII whitespace and punctuation, other symbol rows,
 * malformed byte sequences) collapses into a single space.  Separators are
 * never emitted at the start of the output, never doubled, and are stripped
 * from the end of the result.
 *
 * instr   NUL-terminated input string; it is only read.
 * outstr  destination buffer of at least strlen(instr) + 1 bytes; the output
 *         is never longer than the input.
 *
 * Returns outstr on success.  Returns NULL if either pointer is NULL; when
 * only instr is NULL, outstr is set to the empty string first.
 */

/* Append a line break unless the output is empty or already ends with one. */
static size_t repunlaw_put_split(char *out, size_t len)
{
    if (len > 0 && out[len - 1] != '\n') {
        out[len++] = '\n';
    }
    return len;
}

/* Append a space unless the output is empty or already ends with a separator. */
static size_t repunlaw_put_blank(char *out, size_t len)
{
    if (len > 0 && out[len - 1] != ' ' && out[len - 1] != '\n') {
        out[len++] = ' ';
    }
    return len;
}

char *repunlawchar(char *instr, char *outstr)
{
    if (outstr == NULL) {
        return NULL;
    }
    if (instr == NULL) {
        outstr[0] = '\0';
        return NULL;
    }

    /* Work on unsigned bytes so that values >= 0x80 compare as expected. */
    const unsigned char *in = (const unsigned char *)instr;
    size_t len = 0; /* number of bytes written to outstr so far */

    for (size_t i = 0; in[i] != '\0'; i++) {
        const unsigned char c = in[i];

        if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
            /* lower-case letters and digits are kept as they are */
            outstr[len++] = (char)c;
        } else if (c >= 'A' && c <= 'Z') {
            /* upper case to lower case */
            outstr[len++] = (char)(c - 'A' + 'a');
        } else if (c == '?' || c == '!' || c == ';') {
            /* sentence terminator */
            len = repunlaw_put_split(outstr, len);
        } else if (c < 0x80 || c == 0xFF) {
            /* any other single byte */
            len = repunlaw_put_blank(outstr, len);
        } else {
            /* c is in 0x80..0xFE: lead byte of a double-byte character */
            const unsigned char trail = in[i + 1];

            if (trail == '\0') {
                /* the string ends in the middle of a double-byte character */
                len = repunlaw_put_blank(outstr, len);
                break;
            }

            if (c == 0xA3) {
                /* full-width letters, digits and punctuation */
                i++;
                if (trail == 0xA1 || trail == 0xBB || trail == 0xBF) {
                    len = repunlaw_put_split(outstr, len);
                } else {
                    /* the half-width equivalent is the trail byte minus 0x80 */
                    const int half = (int)trail - 0x80;

                    if (half >= 'A' && half <= 'Z') {
                        outstr[len++] = (char)(half - 'A' + 'a');
                    } else if ((half >= 'a' && half <= 'z') ||
                               (half >= '0' && half <= '9')) {
                        outstr[len++] = (char)half;
                    } else {
                        len = repunlaw_put_blank(outstr, len);
                    }
                }
            } else if (c == 0xA1) {
                /* symbol row: only the ideographic full stop ends a sentence */
                i++;
                if (trail == 0xA3) {
                    len = repunlaw_put_split(outstr, len);
                } else {
                    len = repunlaw_put_blank(outstr, len);
                }
            } else if (c <= 0xA0 || c >= 0xAA) {
                /* Chinese characters: lead byte in 0x80..0xA0 or 0xAA..0xFE */
                if (trail >= 0x40 && trail != 0x7F && trail != 0xFF) {
                    outstr[len++] = (char)c;
                    outstr[len++] = (char)trail;
                    i++;
                } else {
                    /*
                     * Invalid trail byte: drop the stray lead byte only, so
                     * the following byte is still examined on its own.
                     */
                    len = repunlaw_put_blank(outstr, len);
                }
            } else {
                /* remaining symbol rows (0xA2, 0xA4..0xA9): drop both bytes */
                i++;
                len = repunlaw_put_blank(outstr, len);
            }
        }
    }

    /* strip trailing separators */
    while (len > 0 && (outstr[len - 1] == '\n' || outstr[len - 1] == ' ')) {
        len--;
    }
    outstr[len] = '\0';

    return outstr;
}

/*
 * Command-line driver: reads the input corpus line by line, normalises each
 * line with repunlawchar() and writes every non-empty result to the output
 * corpus, one result per line.  The original and the normalised text are also
 * echoed to stdout with the prefixes "Ori:" and "ULA:".
 *
 * Usage: <program> incorpus outcorpus
 *
 * Returns EXIT_SUCCESS when all input was processed and written, and
 * EXIT_FAILURE on a usage error or any I/O failure.
 */
int main(int argc, char *argv[])
{
    enum { LINE_LEN = 10240 }; /* maximum number of bytes handled per read */

    if (argc != 3) {
        fprintf(stderr, "Usage:%s Incorpus[in] outcorpus[out].\n", argv[0]);
        return EXIT_FAILURE;
    }

    FILE *in = fopen(argv[1], "r");
    if (in == NULL) {
        fprintf(stderr, "Open File:%s error.\n", argv[1]);
        return EXIT_FAILURE;
    }

    FILE *out = fopen(argv[2], "w");
    if (out == NULL) {
        fprintf(stderr, "Open File:%s error.\n", argv[2]);
        fclose(in);
        return EXIT_FAILURE;
    }

    char line[LINE_LEN] = {0};
    char dealline[LINE_LEN] = {0};
    int status = EXIT_SUCCESS;

    while (fgets(line, (int)sizeof line, in) != NULL) {
        /* Cut the line at the first CR or LF; safe for empty reads too. */
        line[strcspn(line, "\r\n")] = '\0';

        printf("Ori:%s\n", line);

        if (repunlawchar(line, dealline) != NULL && dealline[0] != '\0') {
            printf("ULA:%s\n", dealline);
            if (fprintf(out, "%s\n", dealline) < 0) {
                fprintf(stderr, "Write File:%s error.\n", argv[2]);
                status = EXIT_FAILURE;
                break;
            }
        }
    }

    if (ferror(in)) {
        fprintf(stderr, "Read File:%s error.\n", argv[1]);
        status = EXIT_FAILURE;
    }

    fclose(in);

    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(out) != 0) {
        fprintf(stderr, "Write File:%s error.\n", argv[2]);
        status = EXIT_FAILURE;
    }

    return status;
}

Processing Chinese Text in C

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.