#!/usr/bin/python
#-*- coding:gbk-*-
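'''
spec: split usrdict into two parts according to whether each record hits
      the 1.26-million-entry person-name list
parms:
[IN]  keywords.txt   person-name keys, one per line
[IN]  file.txt       usrdict records, four tab-separated fields per line
[OUT] matched.txt    records whose first field begins or ends with a key
                     (the key is appended as an extra field)
[OUT] notmatched.txt records that hit no key ("_" is appended instead)
[OUT] ufuwfoverflow  records whose third or fourth field is <= 0
author: liuyusi0121@sogou-inc.com date 20120808
'''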
import re;
import sys;
def LoadKeys(filename):
    '''
    Load the keys stored in *filename* into memory.

    Every line of the file becomes one key, with leading and trailing
    whitespace (including the line terminator) removed.  Blank lines are
    kept and come back as empty strings, so the returned list has exactly
    one entry per line of the file.

    Args:
        filename: path of the key file to read.

    Returns:
        A list of stripped lines, in file order.

    Raises:
        IOError: if the file cannot be opened or read.
    '''
    strip_ws = re.compile(r'^\s+|\s+$')
    # The context manager closes the handle even when reading fails part-way.
    with open(filename, "r") as fid:
        return [strip_ws.sub('', line) for line in fid]
def PrintUsage():
    '''
    Print the command-line usage summary to stdout and terminate.

    Raises:
        SystemExit: always, with exit status 1.
    '''
    print('program [IN] keywords.txt [IN]file.txt [OUT] matched.txt [OUT] notmatched.txt [OUT] ufuwfoverflow')
    # sys.exit() instead of the bare exit(): the latter is injected by the
    # `site` module for interactive use and is missing under `python -S`.
    sys.exit(1)
def BuildKeyIndex(keys, encoding="gbk"):
    '''
    Build a lookup table from decoded key text to its first occurrence.

    Args:
        keys: iterable of keys in priority order (earlier keys win).  Byte
            strings are decoded with *encoding*; values that are already
            text are used as they are.
        encoding: codec used to decode byte-string keys.

    Returns:
        A dict mapping decoded key -> (position, original key).  Empty keys
        and keys that cannot be decoded are left out; for duplicated keys
        only the first position is kept.
    '''
    index = {}
    for position, key in enumerate(keys):
        # An empty key would be a prefix of every record, so it is ignored.
        if not key:
            continue
        try:
            ukey = key.decode(encoding) if isinstance(key, bytes) else key
        except UnicodeDecodeError:
            # An undecodable key can never match decoded text.
            continue
        index.setdefault(ukey, (position, key))
    return index


def FindFirstKey(useg, key_index):
    '''
    Return the earliest-listed key that *useg* starts or ends with.

    Keys are compared literally, so characters that are special in regular
    expressions have no special meaning here.

    Args:
        useg: decoded text of the record's first field.
        key_index: table produced by BuildKeyIndex().

    Returns:
        The original (undecoded) key with the smallest position among all
        keys that are a prefix or a suffix of *useg*, or None if there is
        no such key.
    '''
    best = None
    # Probe every prefix and suffix of the segment against the table instead
    # of testing every key against the segment.
    for size in range(1, len(useg) + 1):
        for candidate in (useg[:size], useg[-size:]):
            hit = key_index.get(candidate)
            if hit is not None and (best is None or hit[0] < best[0]):
                best = hit
    if best is None:
        return None
    return best[1]


def SplitUsrDict(fin, keys, fout_matched, fout_unmatched, fout_overflow,
                 delim="\t"):
    '''
    Route every record read from *fin* to one of three outputs.

    A record is a line with exactly four tab-separated fields after the
    surrounding whitespace has been stripped.  Blank lines, lines with a
    different field count and lines whose third or fourth field is not an
    integer are skipped.

    * third or fourth field <= 0: the stripped line is written unchanged
      to *fout_overflow*;
    * first field starts or ends with a key: the record plus the key as a
      fifth field is written to *fout_matched* and the line is echoed to
      stdout;
    * otherwise (including a first field that is not valid GBK): the record
      plus "_" as a fifth field is written to *fout_unmatched*.

    A progress message is printed to stdout every 100000 non-blank lines.

    Args:
        fin: iterable of input lines.
        keys: keys in priority order, as returned by LoadKeys().
        fout_matched, fout_unmatched, fout_overflow: writable file objects.
        delim: separator used when re-joining the output fields.
    '''
    strip_ws = re.compile("(^\\s+|\\s+$)")
    key_index = BuildKeyIndex(keys)
    linecount = 0
    for line in fin:
        line = strip_ws.sub('', line)
        if not line:
            continue
        if linecount % 100000 == 0:
            print('語料已經處理%d行' % linecount)
        linecount += 1
        linesegs = line.split("\t")
        if len(linesegs) != 4:
            continue
        try:
            overflowed = int(linesegs[2]) <= 0 or int(linesegs[3]) <= 0
        except ValueError:
            # Non-numeric counts: treat like any other malformed record
            # rather than aborting the whole run.
            continue
        if overflowed:
            fout_overflow.write(line)
            fout_overflow.write("\n")
            continue
        segment = linesegs[0]
        try:
            if isinstance(segment, bytes):
                segment = segment.decode('gbk')
        except UnicodeDecodeError:
            # A field that is not valid GBK cannot hit any key.
            key = None
        else:
            key = FindFirstKey(segment, key_index)
        if key is None:
            linesegs.append("_")
            target = fout_unmatched
        else:
            print(line)
            linesegs.append(key)
            target = fout_matched
        target.write(delim.join(linesegs))
        target.write("\n")


if __name__ == "__main__":
    if len(sys.argv) != 6:
        PrintUsage()
    keys = LoadKeys(sys.argv[1])
    print(len(keys))
    # Outputs are opened before the input, and every handle is closed even
    # if processing fails.
    with open(sys.argv[3], 'w') as fout1, \
            open(sys.argv[4], 'w') as fout2, \
            open(sys.argv[5], 'w') as fout3, \
            open(sys.argv[2], "r") as fid:
        SplitUsrDict(fid, keys, fout1, fout2, fout3)