python Regex匹配中文

來源:互聯網
上載者:User
#!/usr/bin/python
#-*- coding:cp936-*-

# Approach: decode each byte string to unicode before applying the regex. This requires knowing the file's encoding, which is GBK in this example.
import cPickle as mypickle
import io
import re
import sys
# Matches a single character in the range U+4E00..U+9FA5.  The ``u`` prefix is
# required on Python 2 so that the \u escapes are interpreted as code points.
HANZI_PATTERN = re.compile(u'[\u4e00-\u9fa5]')


def count_name_characters(lines):
    """Tally family-name and given-name characters of two-character names.

    Args:
        lines: iterable of already-decoded (unicode) text lines.

    Returns:
        A ``(family_counts, given_counts)`` tuple of dicts mapping a single
        character to the number of lines in which it appeared in that
        position.

    A line contributes only when it contains exactly two characters matched
    by ``HANZI_PATTERN``; the first is counted as the family name and the
    second as the given name.  Every other line is ignored.
    """
    family_counts = {}
    given_counts = {}
    for line in lines:
        # No explicit whitespace stripping is needed: findall() only picks
        # the matching characters and skips everything else on the line.
        candidates = HANZI_PATTERN.findall(line)
        if len(candidates) != 2:
            continue
        family, given = candidates
        family_counts[family] = family_counts.get(family, 0) + 1
        given_counts[given] = given_counts.get(given, 0) + 1
    return family_counts, given_counts


def rank_by_count(counts):
    """Return the ``(character, count)`` pairs sorted by descending count."""
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def write_counts(path, ranked_items, encoding='gbk'):
    """Write one ``character<TAB>count`` line per item to *path*.

    The file is opened through a context manager so it is closed even when
    a write fails.
    """
    with io.open(path, 'w', encoding=encoding) as out:
        for char, count in ranked_items:
            out.write(u'%s\t%d\n' % (char, count))


def main(source_path='above50purenames.txt',
         family_path='familyname.txt',
         given_path='firstname.txt',
         encoding='gbk'):
    """Count name characters in *source_path* and write the two rankings.

    Text is decoded once when reading and encoded once when writing, both
    with *encoding* (GBK by default).
    """
    with io.open(source_path, 'r', encoding=encoding) as src:
        family_counts, given_counts = count_name_characters(src)

    family_ranked = rank_by_count(family_counts)
    given_ranked = rank_by_count(given_counts)
    print(family_ranked)

    write_counts(family_path, family_ranked, encoding)
    write_counts(given_path, given_ranked, encoding)
    print('finish')


if __name__ == '__main__':
    main()
   

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.