#!/usr/bin/python
#-*- coding:cp936-*-
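# Approach: decode each str into unicode so the regex can be applied to it; this requires knowing the file's encoding, which is GBK in this example.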
import cPickle as mypickle
import re
import sys
def count_names(lines):
    """Tally name characters found in GBK-encoded byte-string lines.

    Each line is stripped of leading/trailing whitespace, decoded from GBK,
    and scanned for characters in the range U+4E00..U+9FA5.  Only lines
    that contain exactly two such characters are counted: the first
    character is tallied as the family name and the second as the first
    name.  Every other line is ignored.

    Diagnostic information (types, the stripped line, the match count and
    the first matched character) is printed for every line.

    Args:
        lines: iterable of byte strings encoded in GBK.

    Returns:
        A ``(family_counts, first_counts)`` tuple of dicts mapping the
        GBK-encoded character (byte string) to its number of occurrences.

    Raises:
        UnicodeDecodeError: if a line is not valid GBK.
    """
    edge_whitespace = re.compile(br'(^\s+|\s+$)')
    # The u prefix is required: matching is done on the decoded text.
    hanzi = re.compile(u'[\u4e00-\u9fa5]')
    family_counts = {}
    first_counts = {}
    for line in lines:
        line = edge_whitespace.sub(b'', line)
        print(type(line))
        print(line)
        uline = line.decode('gbk')
        print(type(uline))
        candidates = hanzi.findall(uline)
        print(len(candidates))
        if len(candidates) != 2:
            continue
        print(candidates[0])
        # Turn the unicode characters back into GBK byte strings so they
        # can be written to the output files unchanged.
        family = candidates[0].encode('gbk')
        first = candidates[1].encode('gbk')
        family_counts[family] = family_counts.get(family, 0) + 1
        first_counts[first] = first_counts.get(first, 0) + 1
    return family_counts, first_counts


def write_counts(path, items):
    """Write ``(name, count)`` pairs to *path* as ``name<TAB>count`` lines.

    Args:
        path: name of the file to create or overwrite.
        items: iterable of ``(name, count)`` pairs; ``name`` is expected to
            be an already-encoded byte string (Python 2 ``str``).
    """
    # The with-block closes the file even if a write fails part-way.
    with open(path, 'w') as out:
        for name, count in items:
            out.write(name + '\t' + str(count))
            out.write('\n')


if __name__ == '__main__':
    with open('above50purenames.txt', 'r') as source:
        commlines = source.readlines()

    family_counts, first_counts = count_names(commlines)

    family_items = list(family_counts.items())
    print(family_items)  # diagnostic dump, taken before sorting
    first_items = list(first_counts.items())

    # Most frequent names first.
    family_items.sort(key=lambda item: item[1], reverse=True)
    first_items.sort(key=lambda item: item[1], reverse=True)

    write_counts('familyname.txt', family_items)
    write_counts('firstname.txt', first_items)
    print('finish')