Python---Chinese text classification

Source: Internet
Author: User
Tags: readfile

#http://blog.csdn.net/github_36326955/article/details/54891204#comments

#

# -*- coding: utf-8 -*-
"""Segment a Chinese text corpus with jieba.

The corpus is laid out as ``<corpus_path>/<category>/<file>``; every file is
read as GBK, stripped of line breaks and spaces, segmented with ``jieba.cut``
and written (UTF-8, tokens separated by one space) to the mirrored location
``<seg_path>/<category>/<file>``.
"""
import importlib
import os
import sys

# NOTE: the original script called importlib.reload(sys) here.  That is a
# Python 2 leftover (used together with sys.setdefaultencoding) and serves no
# purpose in Python 3, so the call is omitted.

# Disabled example: extract the text of an HTML file with lxml.
#
#     from lxml import html
#
#     def html2txt(path):
#         with open(path, "rb") as f:
#             content = f.read()
#         page = html.document_fromstring(content)
#         text = page.text_content()
#         return text
#
#     if __name__ == "__main__":
#         path = "test.htm"
#         text = html2txt(path)
#         print(text)

# Disabled example: the jieba segmentation modes.
#
#     import jieba
#     seg_list = jieba.cut("I came to Beijing Tsinghua University", cut_all=True)
#     print("Full Mode:" + "/".join(seg_list))
#     seg_list = jieba.cut("I came to Tsinghua University in Beijing", cut_all=False)
#     print("Default (accurate) Mode:" + "/".join(seg_list))
#     seg_list = jieba.cut("He came to NetEase Hang Research building")
#     print(", ".join(seg_list))
#     # search-engine mode
#     seg_list = jieba.cut_for_search("Xiao Ming Master graduated from the Institute of Chinese Academy of Sciences, after studying at Kyoto University in Japan")
#     print(", ".join(seg_list))


def savefile(path, content, _encode="utf-8"):
    """Write the string *content* to *path* using the *_encode* encoding."""
    with open(path, "w", encoding=_encode) as f:
        f.write(content)


def readfile(path, _encode="utf-8"):
    """Return the text of *path* decoded as *_encode*.

    Bytes that cannot be decoded are dropped (``errors="ignore"``) instead of
    raising ``UnicodeDecodeError``.
    """
    with open(path, "r", encoding=_encode, errors="ignore") as f:
        return f.read()


def preprocess(content, save_path):
    """Segment *content* with jieba and save the result to *save_path*.

    ``"\\r\\n"`` sequences and spaces are removed first; the tokens produced by
    ``jieba.cut`` are then joined with a single space and written as UTF-8.
    """
    # Imported lazily so that savefile/readfile stay usable without jieba.
    import jieba

    content = content.replace("\r\n", "").replace(" ", "")
    savefile(save_path, " ".join(jieba.cut(content)))


def corpus_segment(corpus_path, seg_path):
    """Segment every file below *corpus_path* into the same layout in *seg_path*.

    Each sub-directory of *corpus_path* is treated as one category.  Entries
    that are not directories (e.g. a macOS ``.DS_Store`` file) are skipped
    before anything is created for them in *seg_path*.
    """
    for subdir in os.listdir(corpus_path):
        class_path = os.path.join(corpus_path, subdir)
        if not os.path.isdir(class_path):
            continue

        cur_seg_path = os.path.join(seg_path, subdir)
        os.makedirs(cur_seg_path, exist_ok=True)

        for filename in os.listdir(class_path):
            file_path = os.path.join(class_path, filename)
            content = readfile(file_path, _encode="GBK")
            preprocess(content, os.path.join(cur_seg_path, filename))

    print("end of Chinese corpus participle")


if __name__ == "__main__":
    import jieba

    # Enabled here rather than at import time so that importing this module
    # has no side effects.
    jieba.enable_parallel()

    corpus_path = "/users/k/pycharmprojects/prac/train_corpus"
    seg_path = "/users/k/pycharmprojects/prac/train_corpus_seg"
    corpus_segment(corpus_path, seg_path)

    corpus_path = "/users/k/pycharmprojects/prac/test_corpus"
    seg_path = "/users/k/pycharmprojects/prac/test_corpus_seg"
    corpus_segment(corpus_path, seg_path)

# Disabled example: an empty Bunch container.
#
#     from sklearn.datasets.base import Bunch
#     bunch = Bunch(target_name=[], lable=[], filenames=[], contents=[])

#

"""Pack a segmented corpus into a pickled ``Bunch`` object."""
import os
import pickle


# The '_' prefix is there to enhance readability.
def _readfile(path):
    """Return the raw bytes of the file at *path*."""
    with open(path, "rb") as f:
        return f.read()


def corpus2bunch(word_bag_path, seg_path):
    """Collect the files below *seg_path* into a Bunch and pickle it.

    Every sub-directory of *seg_path* whose name contains neither
    ``"DS_Store"`` nor ``"txt"`` is taken as one category.  The resulting
    Bunch has four parallel fields:

    * ``target_name`` - the category names,
    * ``label``       - the category of each file,
    * ``filename``    - the path of each file,
    * ``contents``    - the raw bytes of each file.

    The Bunch is written to *word_bag_path* with ``pickle.dump``; the parent
    directory of *word_bag_path* is created when it does not exist.
    """
    # sklearn.datasets.base was a private module and has been removed from
    # scikit-learn; Bunch is publicly available from sklearn.utils.  The old
    # location is kept as a fallback for very old installations.
    try:
        from sklearn.utils import Bunch
    except ImportError:
        from sklearn.datasets.base import Bunch

    catelist = [
        x
        for x in os.listdir(seg_path)
        if "DS_Store" not in str(x) and "txt" not in str(x)
    ]

    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    bunch.target_name.extend(catelist)

    for subdir in catelist:
        class_path = os.path.join(seg_path, subdir)
        for filename in os.listdir(class_path):
            filepath = os.path.join(class_path, filename)
            bunch.label.append(subdir)
            bunch.filename.append(filepath)
            bunch.contents.append(_readfile(filepath))  # append bytes

    out_dir = os.path.dirname(word_bag_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(word_bag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)

    print("build text Object end! ")


if __name__ == "__main__":
    word_bag_path = "/users/k/pycharmprojects/prac/train_word_bag/train_set.dat"
    seg_path = "/users/k/pycharmprojects/prac/train_corpus_seg"
    corpus2bunch(word_bag_path, seg_path)

    # NOTE(review): the test set is also written to a file named
    # "train_set.dat" - possibly meant to be "test_set.dat"; confirm.
    word_bag_path = "/users/k/pycharmprojects/prac/test_word_bag/train_set.dat"
    seg_path = "/users/k/pycharmprojects/prac/test_corpus_seg"
    corpus2bunch(word_bag_path, seg_path)

Python---Chinese text classification

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on the page have no relationship with Alibaba Cloud. If you find the content of the page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months of usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.