#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Character that separates sentences in the cleaned output. */
#define SPLIT_CHAR '\n'

/* Append a sentence break unless the output is empty or already ends in one. */
static size_t rul_put_break(char *out, size_t n)
{
    if (n > 0 && out[n - 1] != SPLIT_CHAR) {
        out[n++] = SPLIT_CHAR;
    }
    return n;
}

/* Append a space unless the output is empty or already ends in a space or break. */
static size_t rul_put_space(char *out, size_t n)
{
    if (n > 0 && out[n - 1] != ' ' && out[n - 1] != SPLIT_CHAR) {
        out[n++] = ' ';
    }
    return n;
}

/* Lead bytes whose two-byte sequences are kept verbatim as Chinese characters. */
static int rul_is_hanzi_lead(unsigned char c)
{
    return (c >= 0x80 && c <= 0xA0) || (c >= 0xAA && c <= 0xFE);
}

/* Valid second byte of a kept two-byte character. */
static int rul_is_hanzi_trail(unsigned char c)
{
    return c >= 0x40 && c <= 0xFE && c != 0x7F;
}

/* Handle the byte following lead 0xA3 (full-width letters, digits, punctuation). */
static size_t rul_put_fullwidth(char *out, size_t n, unsigned char trail)
{
    if (trail == 0xA1 || trail == 0xBB || trail == 0xBF) {
        /* full-width '!', ';' and '?' end a sentence */
        return rul_put_break(out, n);
    }

    /* Subtracting 0x80 maps the trail byte onto its half-width ASCII twin. */
    const int half = (int)trail - 0x80;
    if (half >= 'A' && half <= 'Z') {
        out[n++] = (char)(half - 'A' + 'a');
        return n;
    }
    if ((half >= 'a' && half <= 'z') || (half >= '0' && half <= '9')) {
        out[n++] = (char)half;
        return n;
    }
    return rul_put_space(out, n);
}

/**
 * repunlawchar - strip "unlawful" characters from one line of double-byte
 * (GBK/GB2312-style, judging by the byte ranges handled) text.
 *
 * Keeps Chinese characters, English letters (folded to lower case), digits
 * and single spaces. Half-width '?', '!', ';', their full-width forms
 * (0xA3 0xBF / 0xA3 0xA1 / 0xA3 0xBB) and the ideographic full stop
 * (0xA1 0xA3) become a line break. Full-width letters and digits are
 * converted to half-width. Every other character becomes a space; runs of
 * spaces/breaks are collapsed and nothing is emitted at the very start.
 * One trailing space or line break is removed.
 *
 * @param instr   NUL-terminated input; it is only read.
 * @param outstr  Destination buffer; must hold at least strlen(instr) + 1
 *                bytes (the output is never longer than the input).
 * @return outstr, or NULL when instr or outstr is NULL.
 */
char *repunlawchar(char *instr, char *outstr)
{
    if (instr == NULL || outstr == NULL) {
        return NULL;
    }

    /* Work on unsigned bytes so values >= 0x80 compare correctly. */
    const unsigned char *in = (const unsigned char *)instr;
    const size_t len = strlen(instr);
    size_t n = 0; /* number of bytes written to outstr so far */

    for (size_t i = 0; i < len; i++) {
        const unsigned char c = in[i];

        if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
            outstr[n++] = (char)c;
        } else if (c >= 'A' && c <= 'Z') {
            outstr[n++] = (char)(c - 'A' + 'a');
        } else if (c == '?' || c == '!' || c == ';') {
            n = rul_put_break(outstr, n);
        } else if (c < 0x80 || c == 0xFF || i + 1 >= len) {
            /* Other ASCII, the single byte 0xFF, or a lead byte cut off by
             * the end of the string: nothing to pair it with. */
            n = rul_put_space(outstr, n);
        } else {
            /* Two-byte sequence: consume the trail byte as well. */
            const unsigned char trail = in[++i];

            if (c == 0xA3) {
                n = rul_put_fullwidth(outstr, n, trail);
            } else if (c == 0xA1 && trail == 0xA3) {
                /* ideographic full stop ends a sentence */
                n = rul_put_break(outstr, n);
            } else if (rul_is_hanzi_lead(c) && rul_is_hanzi_trail(trail)) {
                outstr[n++] = (char)c;
                outstr[n++] = (char)trail;
            } else {
                /* Other symbols, and malformed pairs, are dropped. Emitting a
                 * collapsed space here (rather than skipping the write while
                 * still advancing the output index) avoids leaving an
                 * uninitialised byte in outstr. */
                n = rul_put_space(outstr, n);
            }
        }
    }

    /* Drop a single trailing separator so the line does not end in one. */
    if (n > 0 && (outstr[n - 1] == SPLIT_CHAR || outstr[n - 1] == ' ')) {
        n--;
    }
    outstr[n] = '\0';
    return outstr;
}
/* Defined earlier in this file; the declaration is repeated here so that
 * main is checked against the exact signature it calls. */
char *repunlawchar(char *instr, char *outstr);

/**
 * main - clean a corpus file line by line.
 *
 * Usage: prog <incorpus> <outcorpus>
 *
 * Reads argv[1] one line at a time, passes each line through
 * repunlawchar() and writes every non-empty result, followed by a newline,
 * to argv[2]. The original and the cleaned line are echoed to stdout with
 * the prefixes "Ori:" and "ULA:".
 *
 * @return 0 on success, EXIT_FAILURE on a usage error or when a file cannot
 *         be opened or written.
 */
int main(int argc, char *argv[])
{
    /* Maximum bytes per read, including the terminator. A longer input line
     * is processed in several pieces by fgets. */
    enum { LINE_LEN = 10240 };

    if (argc != 3) {
        printf("Usage:%s Incorpus[in] outcorpus[out].\n", argv[0]);
        return EXIT_FAILURE;
    }

    FILE *in = fopen(argv[1], "r");
    if (in == NULL) {
        printf("Open File:%s error.", argv[1]);
        return EXIT_FAILURE;
    }

    FILE *out = fopen(argv[2], "w");
    if (out == NULL) {
        printf("Open File:%s error.", argv[2]);
        fclose(in);
        return EXIT_FAILURE;
    }

    char line[LINE_LEN];
    char dealline[LINE_LEN]; /* cleaned text is never longer than the input */

    while (fgets(line, LINE_LEN, in) != NULL) {
        /* Cut the line at the first CR or LF; safe for empty strings too. */
        line[strcspn(line, "\r\n")] = '\0';

        printf("Ori:%s\n", line);

        const char *cleaned = repunlawchar(line, dealline);
        if (cleaned != NULL && cleaned[0] != '\0') {
            printf("ULA:%s\n", cleaned);
            fprintf(out, "%s\n", cleaned);
        }
    }

    fclose(in);

    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(out) != 0) {
        printf("Write File:%s error.", argv[2]);
        return EXIT_FAILURE;
    }
    return 0;
}
/* C language: processing Chinese text */