Python 3.5 data processing: jieba + scikit-learn installation and a first example

First, install pip3:
# sudo apt-get install python3-pip
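To confirm pip3 is on your path before moving on (my own quick check, not part of the original steps):
# pip3 --version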
Second, install jieba:
# sudo pip3 install jieba
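To check that jieba works, a one-line segmentation test is enough. This is a minimal sketch using the sample sentence from jieba's own documentation:

import jieba

# Full mode (cut_all=True) lists every word jieba can find in the string
print('/'.join(jieba.cut('我来到北京清华大学', cut_all=True)))
# expected output, roughly: 我/来到/北京/清华/清华大学/华大/大学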
Third, install scikit-learn:
# sudo pip3 install scikit-learn
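You can verify the install with a quick import (my own check, not from the original):
# python3 -c "import sklearn; print(sklearn.__version__)"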
Fourth, install scikit-learn's dependencies (numpy and scipy):
# sudo pip3 install numpy
# sudo pip3 install scipy
Note: when installing from within mainland China, these downloads may time out. If that happens, point pip at the Tsinghua PyPI mirror:
# sudo pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple numpy
# sudo pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple scipy
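Whichever route you use, you can confirm both dependencies load (my own sanity check):
# python3 -c "import numpy, scipy; print(numpy.__version__, scipy.__version__)"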
Fifth, a simple implementation of word segmentation plus TF-IDF computation:
#!/usr/bin/env python3
# coding=utf-8
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


class Tfi(object):
    def __init__(self):
        self.stop_list = []  # stop words to drop after segmentation (empty here)

    def fenci(self, file):
        """Segment one file with jieba and write the result to res/<file>."""
        fin = open(file, 'r')
        read_b = fin.read()
        fin.close()
        # Keep only letters, digits and CJK characters
        read_res = ''.join(re.findall(u'[a-zA-Z0-9\u4e00-\u9fa5]+', read_b))
        cut_res = jieba.cut(read_res, cut_all=True)  # full-mode segmentation
        line_res = ''
        for i in cut_res:
            if i not in self.stop_list:
                line_res = line_res + i + ' '
        fout = open('res/' + file, 'w')
        fout.write(line_res)
        fout.close()

    def cipin(self, fil_list):
        """Compute TF-IDF over the segmented files; write weights to fes/<file>."""
        corpus = []
        for fil in fil_list:
            ffout = open('res/' + fil, 'r')
            read_r = ffout.read()
            ffout.close()
            corpus.append(read_r)
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
        word = vectorizer.get_feature_names()  # keywords across all texts
        weight = tfidf.toarray()               # one row of weights per document
        for j in range(len(weight)):
            f = open('fes/' + fil_list[j], 'w')
            for i in range(len(word)):
                f.write(word[i] + ' ' + str(weight[j][i]) + '\n')
            f.close()


if __name__ == '__main__':
    # NOTE: the res/ and fes/ directories must already exist, and
    # 'inputtext' is the raw input file in the current directory.
    first = Tfi()
    fil_list = ['inputtext']
    first.fenci('inputtext')
    first.cipin(fil_list)
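To see what the CountVectorizer + TfidfTransformer pipeline inside cipin() actually produces, here is a minimal standalone sketch on a toy corpus of my own invention; each string stands in for one segmented, space-separated file from res/:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Each string is one document whose words are already space-separated,
# just like the segmented files the script writes to res/
corpus = ['machine learning is fun', 'deep learning is popular']

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(corpus)         # raw term-count matrix
tfidf = TfidfTransformer().fit_transform(counts)  # reweight counts into TF-IDF

words = vectorizer.get_feature_names()            # vocabulary, one column per word
for row, doc in zip(tfidf.toarray(), corpus):
    print(doc)
    for word, weight in zip(words, row):
        print(' ', word, round(weight, 3))

Words that appear in both documents ('learning', 'is') receive lower weights than words unique to one document, which is exactly the discrimination TF-IDF is meant to provide.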