#http://blog.csdn.net/github_36326955/article/details/54891204#comments
#
# -*- coding: utf-8 -*-
"""Segment a Chinese text-classification corpus with jieba.

Walks a corpus directory laid out as ``corpus_path/<category>/<file>``,
segments each document into space-separated tokens, and mirrors the
directory layout under ``seg_path``.
"""
import os

import jieba

# Use jieba's multi-process segmentation (POSIX only; no-op flag on Windows).
jieba.enable_parallel()


def savefile(path, content, _encode='utf-8'):
    """Write ``content`` (str) to ``path`` using the given text encoding."""
    with open(path, 'w', encoding=_encode) as f:
        f.write(content)


def readfile(path, _encode='utf-8'):
    """Read ``path`` as text, silently skipping undecodable bytes.

    errors='ignore' is deliberate: the corpus files are GBK-encoded and
    occasionally contain malformed sequences that should not abort a run.
    """
    with open(path, 'r', encoding=_encode, errors='ignore') as f:
        content = f.read()
    return content


def preprocess(content, save_path):
    """Normalize one document, segment it, and save the result.

    Strips CRLF line breaks and literal spaces (Chinese text carries no
    meaningful inter-word spaces), runs jieba word segmentation, then
    writes the tokens to ``save_path`` joined by single spaces so that
    downstream bag-of-words tooling can split on whitespace.
    """
    content = content.replace("\r\n", "")
    content = content.replace(" ", "")
    content_seg = jieba.cut(content)
    savefile(save_path, " ".join(content_seg))


def corpus_segment(corpus_path, seg_path):
    """Segment every document under ``corpus_path`` into ``seg_path``.

    Each immediate subdirectory of ``corpus_path`` is treated as one
    category; a same-named subdirectory is created under ``seg_path``.
    macOS ``.DS_Store`` entries are skipped.
    """
    catelist = os.listdir(corpus_path)
    for subdir in catelist:
        class_path = os.path.join(corpus_path, subdir)
        cur_seg_path = os.path.join(seg_path, subdir)
        if not os.path.exists(cur_seg_path):
            os.makedirs(cur_seg_path)
        if ".DS_Store" not in class_path:
            file_list = os.listdir(class_path)
            for filename in file_list:
                file_path = os.path.join(class_path, filename)
                # Source corpus is GBK-encoded (common for Chinese datasets).
                content = readfile(file_path, _encode='GBK')
                save_path = os.path.join(cur_seg_path, filename)
                preprocess(content, save_path)
    print("end of Chinese corpus participle")


if __name__ == "__main__":
    corpus_path = "/users/k/pycharmprojects/prac/train_corpus"
    seg_path = "/users/k/pycharmprojects/prac/train_corpus_seg"
    corpus_segment(corpus_path, seg_path)
    corpus_path = "/users/k/pycharmprojects/prac/test_corpus"
    seg_path = "/users/k/pycharmprojects/prac/test_corpus_seg"
    corpus_segment(corpus_path, seg_path)
#
ImportOSImportPickle fromSklearn.datasets.baseImportBunch"""' _ ' to enhance readability"""def_readfile (PATH): With open (path,"RB",) as F:content=F.read ()returncontentdefCorpus2bunch (Word_bag_path,seg_path): Catelist=Os.listdir (Seg_path) Bunch= Bunch (target_name=[],label=[],filename=[],contents=[]) catelist= [x forXinchCatelistif "Ds_store" not inchSTR (x) and "txt" not inchstr (x)] Bunch.target_name.extend (catelist) forSubDirinchCatelist:class_path=Os.path.join (Seg_path,subdir)#Class_path = Os.path.join (Class_path, "")Filename_list =Os.listdir (Class_path) forFileNameinchFilename_list:filepath=Os.path.join (class_path,filename) bunch.label.append (subdir) bunch.filename.append (filepath) Bunch.contents.append (_readfile (filepath))#Append bytesWith open (Word_bag_path,"WB") as File_obj:pickle.dump (bunch,file_obj)Print("build text Object end! ")if __name__=="__main__": Word_bag_path="/users/k/pycharmprojects/prac/train_word_bag/train_set.dat"Seg_path="/users/k/pycharmprojects/prac/train_corpus_seg"corpus2bunch (word_bag_path,seg_path) Word_bag_path="/users/k/pycharmprojects/prac/test_word_bag/train_set.dat"Seg_path="/users/k/pycharmprojects/prac/test_corpus_seg"corpus2bunch (Word_bag_path,seg_path)
# Python — Chinese text classification