python使用帶漢字的Regex
來源:互聯網
上載者:User
#!/usr/bin/python
# -*- coding: cp936 -*-
"""Reduce each line of a GBK text file to its Chinese characters and de-duplicate.

Command line: SCRIPT INPUT PURE ASSIST

    INPUT   original GBK-encoded text file
    PURE    output: every distinct filtered line, written once
    ASSIST  output: the filtered form of every input line, duplicates included
"""
import io
import re
import sys

# All three files are read and written as GBK text.
ENCODING = 'gbk'

# One CJK ideograph in the range U+4E00-U+9FA5.
HANZI_RE = re.compile(u'[\u4e00-\u9fa5]')
# Leading / trailing whitespace.
EDGE_SPACE_RE = re.compile(u'(^\\s+|\\s+$)')
# "Irrelevant" punctuation: curly double quotes and parentheses.
# NOTE(review): the class lists "(" and ")" twice (bare and escaped); the
# article may originally have listed full-width parentheses as well - confirm.
NOISE_RE = re.compile(u'[\u201d\u201c(\\(\\))]')


def extract_hanzi(text):
    """Return only the Chinese characters of *text*, in their original order.

    Args:
        text: a unicode string (one decoded input line).

    Returns:
        A unicode string made of the characters of *text* that fall in
        U+4E00-U+9FA5; empty when *text* contains none.
    """
    kept = u''.join(HANZI_RE.findall(text))
    # Both substitutions below run on a string that already contains nothing
    # but U+4E00-U+9FA5 characters, so neither pattern can match. They are
    # kept only to mirror the original processing pipeline.
    kept = NOISE_RE.sub(u'', kept)
    return EDGE_SPACE_RE.sub(u'', kept)


def main(argv=None):
    """Filter INPUT into the PURE and ASSIST files named on the command line.

    Args:
        argv: argument list laid out like ``sys.argv`` (program name first);
            defaults to ``sys.argv``.

    Returns:
        0 on success, 2 when fewer than three file arguments were supplied.

    Raises:
        IOError / OSError: a file cannot be opened.
        UnicodeDecodeError: INPUT is not valid GBK.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        prog = argv[0] if argv else 'script'
        sys.stderr.write('usage: %s INPUT PURE ASSIST\n' % prog)
        return 2

    seen = set()
    distinct = []  # first-seen order, so the PURE file is deterministic
    # The context managers close every file even if decoding fails midway.
    with io.open(argv[1], 'r', encoding=ENCODING) as source, \
            io.open(argv[2], 'w', encoding=ENCODING) as pure_out, \
            io.open(argv[3], 'w', encoding=ENCODING) as assist_out:
        for line in source:
            cleaned = extract_hanzi(line)
            assist_out.write(cleaned + u'\n')
            if cleaned not in seen:
                seen.add(cleaned)
                distinct.append(cleaned)
        for entry in distinct:
            pure_out.write(entry + u'\n')

    print('procedure %s finish!\n' % argv[0])
    return 0


if __name__ == "__main__":
    sys.exit(main())