標籤:transform end board .text style str file gpo blank
文章來自於我的個人部落格:python 分詞計算文檔TF-IDF值並排序
該程式實現的功能是:首先讀取一些文檔,然後通過jieba來分詞,將分詞存入檔案,然後通過sklearn計算每一個分詞文檔中的tf-idf值,再將結果排序輸出到一個大檔案裡
依賴包:
sklearn
jieba
註:此程式參考了一位同行的程式後進行了改動
# -*- coding: utf-8 -*-
"""Tokenize documents with jieba and rank terms by TF-IDF.

Reads every (non-hidden) document in a source directory, segments each one
with jieba, writes the tokens to per-document "-seg.txt" files, computes
TF-IDF weights over the whole corpus with scikit-learn, writes one weight
file per document, and finally shells out to `sort` to produce a single
ranked file.

@author: jiangfuqiang
"""
import os
import re
import time

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Tokens that are pure punctuation/markup noise and should be discarded.
_NOISE_TOKENS = {"", "=", "[", "]", "(", ")"}
# Original source had the (scrape-mangled) pattern 'w+'; the evident intent
# is to drop tokens made of ASCII word characters (English words, numbers)
# so that only Chinese terms are kept.
_ASCII_WORD = re.compile(r"[A-Za-z0-9_]+")


def getFileList(path):
    """Return (filenames, path) for all non-hidden entries in *path*.

    Entries whose name starts with '.' (hidden files) are skipped.
    """
    filelist = [f for f in os.listdir(path) if not f.startswith(".")]
    return filelist, path


def fenci(filename, path, segPath):
    """Segment one document with jieba and save the tokens to *segPath*.

    The result is written as a single space-separated line to
    ``segPath/<filename>-seg.txt``.
    """
    with open(os.path.join(path, filename), "r", encoding="utf-8") as f:
        text = f.read()

    # Make sure the folder for segmentation results exists.
    if not os.path.exists(segPath):
        os.mkdir(segPath)

    # Full-mode segmentation, then strip whitespace/newlines and filter noise.
    result = []
    for seg in jieba.cut(text, cut_all=True):
        seg = "".join(seg.split())  # collapse any embedded whitespace
        if seg not in _NOISE_TOKENS and not _ASCII_WORD.search(seg):
            result.append(seg)

    # Persist the tokens, space-separated, for the TF-IDF pass.
    with open(os.path.join(segPath, filename + "-seg.txt"), "w",
              encoding="utf-8") as f:
        f.write(" ".join(result))


def Tfidf(filelist, sFilePath, path):
    """Compute TF-IDF over the segmented documents and write weight files.

    Reads ``path/<name>-seg.txt`` for every name in *filelist* (the files
    produced by :func:`fenci`), then writes one ``NNNNN.txt`` file per
    document into *sFilePath* with lines of the form ``<term> <weight>``.
    """
    corpus = []
    for ff in filelist:
        # NOTE: the original concatenated path + ff with no separator, so it
        # never found the files fenci had written; os.path.join fixes that.
        seg_file = os.path.join(path, ff + "-seg.txt")
        with open(seg_file, "r", encoding="utf-8") as f:
            corpus.append(f.read())

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    try:
        word = vectorizer.get_feature_names_out()  # sklearn >= 1.0
    except AttributeError:
        word = vectorizer.get_feature_names()      # older sklearn
    weight = tfidf.toarray()

    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)

    for i in range(len(weight)):
        out_name = os.path.join(sFilePath, str(i).zfill(5) + ".txt")
        print("----------writing all the tf-idf in the", i,
              "file into", out_name)
        with open(out_name, "w", encoding="utf-8") as f:
            for j in range(len(word)):
                # One "<term> <weight>" pair per line; `sort -nrk 2` below
                # relies on line-oriented output.
                f.write(word[j] + " " + str(weight[i][j]) + "\n")


if __name__ == "__main__":
    # Folder for the TF-IDF result files (timestamped to avoid clobbering).
    sFilePath = "/home/lifeix/soft/allfile/tfidffile" + str(time.time())
    # Folder for the segmentation results.
    segPath = "/home/lifeix/soft/allfile/segfile"

    (allfile, path) = getFileList("/home/lifeix/soft/allkeyword")
    for ff in allfile:
        print("Using jieba on " + ff)
        fenci(ff, path, segPath)

    Tfidf(allfile, sFilePath, segPath)
    # Rank all terms across the whole corpus into a single sorted file.
    os.system("sort -nrk 2 " + sFilePath + "/*.txt >" + sFilePath
              + "/sorted.txt")
python 分詞計算文檔TF-IDF值並排序