Article from my personal blog: using Python word segmentation to compute document TF-IDF values and sort them
The program works as follows: it first reads a set of documents, segments each one into words with Jieba, writes the segmentation results to files, then uses scikit-learn to compute the TF-IDF value of every word in each document, and finally sorts the results into one large file.
Dependent Packages:
Sklearn
Jieba
Note: this program is a revised version of an earlier companion program.
# -*- coding: utf-8 -*-
"""Compute per-document TF-IDF values over a folder of text files.

Reads every document in a directory, segments each one with jieba,
writes the segmented text to disk, then uses scikit-learn to compute
the TF-IDF weight of every word in every document (one output file per
document), and finally shell-sorts all weights into one big file.

@author: jiangfuqiang
"""
import os
import re
import time

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_file_list(path):
    """Return ``(filenames, path)`` for all non-hidden files in *path*."""
    filelist = [f for f in os.listdir(path) if not f.startswith('.')]
    return filelist, path


def fenci(filename, path, segpath):
    """Segment one document with jieba and save the result.

    The segmented words are joined with single spaces and written to
    ``<segpath>/<filename>-seg.txt``.
    """
    with open(os.path.join(path, filename), 'r') as f:
        text = f.read()

    # Folder that holds the segmentation results.
    if not os.path.exists(segpath):
        os.mkdir(segpath)

    # Full-mode segmentation of the document.
    seg_list = jieba.cut(text, cut_all=True)

    # Keep only tokens that contain at least one word character; this
    # drops whitespace-only tokens and bare punctuation in one pass
    # (the original tried to enumerate punctuation by hand).
    word_re = re.compile(r'\w+', re.UNICODE)
    result = [tok.strip() for tok in seg_list
              if tok.strip() and word_re.search(tok)]

    # Save the space-separated segmentation result locally.
    with open(os.path.join(segpath, filename + "-seg.txt"), 'w') as f:
        f.write(' '.join(result))


def tfidf(filelist, sfilepath, path):
    """Read the segmented documents and write per-document TF-IDF weights.

    For document *i* the output file is ``<sfilepath>/0000i.txt``; each
    line holds ``word weight`` (newline-terminated so the final shell
    ``sort`` can operate line-wise).
    """
    corpus = []
    for ff in filelist:
        # NOTE: os.path.join fixes the original's missing "/" between
        # the segment folder and the file name.
        fname = os.path.join(path, ff + "-seg.txt")
        with open(fname, 'r') as f:
            corpus.append(f.read())

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    weights = transformer.fit_transform(
        vectorizer.fit_transform(corpus)).toarray()
    try:
        words = vectorizer.get_feature_names_out()   # scikit-learn >= 1.0
    except AttributeError:
        words = vectorizer.get_feature_names()       # older scikit-learn

    # Folder that holds the TF-IDF results.
    if not os.path.exists(sfilepath):
        os.mkdir(sfilepath)
    for i in range(len(weights)):
        outname = os.path.join(sfilepath, str(i).zfill(5) + ".txt")
        print("writing all the tf-idf of document", i, "into", outname)
        with open(outname, 'w') as f:
            for j in range(len(words)):
                f.write(words[j] + " " + str(weights[i][j]) + "\n")


if __name__ == "__main__":
    # Folder that stores the TF-IDF results (timestamped so reruns
    # never overwrite earlier output).
    sfilepath = "/home/lifeix/soft/allfile/tfidffile" + str(time.time())
    # Folder that stores the segmentation results.
    segpath = '/home/lifeix/soft/allfile/segfile'
    allfile, path = get_file_list('/home/lifeix/soft/allkeyword')
    for ff in allfile:
        print("using jieba on " + ff)
        fenci(ff, path, segpath)
    tfidf(allfile, sfilepath, segpath)
    # Sort every per-document file by the weight column, descending,
    # into one combined file.
    os.system("sort -nrk 2 " + sfilepath + "/*.txt > "
              + sfilepath + "/sorted.txt")
Using Python word segmentation to compute document TF-IDF values and sort them