#!/usr/bin/python
# -*- coding: GBK -*-
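'''
SPEC: split usrdict into parts depending on whether each entry hits the
"million people" keyword list.
Parms:
[In]  keywords file, one keyword per line
[In]  usrdict input file, four tab-separated fields per line
[Out] matched entries, unmatched entries, and entries whose third or fourth
      field is not a positive integer
Author: liuyusi0121@sogou-inc.com date 20120808
'''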
import re
import sys
def Loadkeys(filename, encoding="gbk"):
    '''
    Load the keyword list from a file into memory.

    Each line is stripped of surrounding ASCII whitespace and decoded with
    *encoding*.  Blank lines are dropped, because an empty keyword would
    later build a pattern that matches every entry.  Lines that cannot be
    decoded are skipped instead of aborting the whole load.

    Args:
        filename: Path of the keyword file, one keyword per line.
        encoding: Encoding of the keyword file.  Defaults to GBK, the
            encoding this script uses when decoding its data.

    Returns:
        A list of decoded, non-empty keywords in file order.
    '''
    keys = []
    # Binary mode keeps decoding under our control, so one bad line does not
    # make the whole file unreadable.
    with open(filename, "rb") as fid:
        for raw_line in fid:
            # Strip on the raw bytes so only ASCII whitespace is removed.
            raw_key = raw_line.strip()
            if not raw_key:
                continue
            try:
                keys.append(raw_key.decode(encoding))
            except UnicodeDecodeError:
                continue
    return keys


# The script body refers to this loader with a lower-case spelling; keep both
# names resolvable.
loadkeys = Loadkeys
def Printusage():
    '''
    Print the command-line usage summary and terminate the program.

    The summary lists the five expected arguments in order: two input files
    followed by three output files.  The process exits with status 1.

    Raises:
        SystemExit: Always, with exit code 1.
    '''
    print('Program [in] keywords.txt [in] in-file.txt [out] matched.txt [out] notmatched.txt [out] ufuwfoverflow')
    sys.exit(1)
def _compile_key_patterns(keys, encoding="gbk"):
    '''
    Compile one "starts with key or ends with key" pattern per keyword.

    Keywords may be text or raw bytes.  Empty keywords are dropped because
    their pattern would match every entry.  Keywords that cannot be decoded,
    encoded, or compiled are skipped.

    NOTE(review): each keyword is inserted into the regular expression
    unescaped, so regex metacharacters inside a keyword are interpreted as
    regex syntax.  If the keywords are meant to be plain text, wrap them in
    re.escape().

    Returns:
        A list of (raw_key_bytes, compiled_pattern) tuples.
    '''
    compiled = []
    for key in keys:
        if not key:
            continue
        try:
            if isinstance(key, bytes):
                raw_key, text_key = key, key.decode(encoding)
            else:
                raw_key, text_key = key.encode(encoding), key
            pattern = re.compile(u"(^" + text_key + u"|" + text_key + u"$)")
        except (UnicodeError, re.error):
            continue
        compiled.append((raw_key, pattern))
    return compiled


def _find_matching_key(field, patterns, encoding):
    '''
    Return the raw bytes of the first keyword whose pattern hits *field*.

    Returns None when no keyword matches or when *field* cannot be decoded
    with *encoding*.
    '''
    try:
        text = field.decode(encoding)
    except UnicodeDecodeError:
        return None
    for index, (raw_key, pattern) in enumerate(patterns):
        if index % 100000 == 0:
            print("%d scanned in Mode" % index)
        if pattern.search(text):
            return raw_key
    return None


def split_corpus(keys, input_path, matched_path, unmatched_path,
                 overflow_path, encoding="gbk"):
    '''
    Split the rows of *input_path* across three output files.

    Every input line is stripped of surrounding whitespace and split on tabs.
    Blank lines, rows without exactly four fields, and rows whose third or
    fourth field is not an integer are skipped.  The remaining rows go to:

    * overflow_path  - third or fourth field is <= 0; row written unchanged.
    * matched_path   - first field starts or ends with a keyword; the first
                       such keyword is appended as a fifth field.
    * unmatched_path - everything else; "_" is appended as a fifth field.

    Rows are handled as raw bytes and only the first field is decoded, so an
    undecodable field sends the row to the unmatched file.

    Args:
        keys: Keywords (text or bytes) to test against the first field.
        input_path: Path of the tab-separated input file.
        matched_path: Output path for rows that hit a keyword.
        unmatched_path: Output path for rows that hit no keyword.
        overflow_path: Output path for rows with a non-positive count field.
        encoding: Encoding used to decode the first field and the keywords.
    '''
    delim = b"\t"
    # Patterns are compiled once here instead of once per key per input line.
    patterns = _compile_key_patterns(keys, encoding)
    with open(input_path, "rb") as fid, \
            open(matched_path, "wb") as fout_matched, \
            open(unmatched_path, "wb") as fout_unmatched, \
            open(overflow_path, "wb") as fout_overflow:
        linecount = 0
        for raw_line in fid:
            line = raw_line.strip()
            if not line:
                continue
            if linecount % 100000 == 0:
                print("The corpus has processed %d rows." % linecount)
            linecount += 1
            linesegs = line.split(delim)
            if len(linesegs) != 4:
                continue
            try:
                non_positive = int(linesegs[2]) <= 0 or int(linesegs[3]) <= 0
            except ValueError:
                # Treat a non-numeric count like any other malformed row.
                continue
            if non_positive:
                fout_overflow.write(line + b"\n")
                continue
            matched_key = _find_matching_key(linesegs[0], patterns, encoding)
            if matched_key is None:
                linesegs.append(b"_")
                fout_unmatched.write(delim.join(linesegs) + b"\n")
            else:
                print(line.decode(encoding, "replace"))
                linesegs.append(matched_key)
                fout_matched.write(delim.join(linesegs) + b"\n")


def main(argv):
    '''
    Command-line entry point.

    Expects argv to hold the program name followed by five paths: keyword
    file, input file, matched output, unmatched output, overflow output.
    Prints the usage text and exits when the argument count is wrong.
    '''
    if len(argv) != 6:
        Printusage()
    keys = Loadkeys(argv[1])
    print(len(keys))
    split_corpus(keys, argv[2], argv[3], argv[4], argv[5])


if __name__ == "__main__":
    main(sys.argv)