Text similarity is computed with scikit-learn (sklearn), and the resulting document-to-document similarity matrix is saved to a file. The TF-IDF feature weights of each text are extracted and used to calculate the similarity between texts.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Compute pairwise document similarity from TF-IDF weights with scikit-learn.

Reads ``docs.txt`` (one document per line) from a working directory and
writes three files into the same directory:

* ``tif.txt``        -- for every document, its non-zero TF-IDF weights
* ``dictionary.txt`` -- the vocabulary, one ``index<TAB>word`` per line
* ``simmatrix.csv``  -- the dense document-by-document similarity matrix

Usage: run the script with an optional directory argument; without one the
directory hard-coded in the original script is used.
"""
import numpy
import os
import sys

# The three names marked "noqa" are no longer used by the script; the imports
# are retained from the original so nothing that relied on them breaks.
from sklearn import feature_extraction  # noqa: F401
from sklearn.feature_extraction.text import TfidfTransformer  # noqa: F401
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer  # noqa: F401

DEFAULT_DIR = "C:\\Users\\hd\\Desktop"


def load_corpus(path):
    """Return the documents stored in *path* as a list, one entry per line.

    Lines are returned as read (trailing newlines included), so the row
    index of every later matrix equals the zero-based line number in the file.
    """
    # Shape of the expected data, from the example left in the original script:
    # corpus = ["I came to Tsinghua University in Beijing",
    #           "He came to the NetEase Hangzhou Research Building",
    #           "Xiao Ming graduated with a master's from the Chinese Academy of Sciences",
    #           "I love Beijing Tian'anmen"]
    #
    # UTF-8 is used because the original handed raw bytes to the vectorizer,
    # which decodes bytes as UTF-8 by default.
    with open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


def build_tfidf(corpus):
    """Fit a TF-IDF model on *corpus*.

    Returns a ``(tfidf, words)`` pair: ``tfidf`` is the sparse matrix whose
    element ``[i][j]`` is the TF-IDF weight of word ``j`` in document ``i``,
    and ``words`` is the vocabulary list indexed by ``j``.
    """
    vectorizer = TfidfVectorizer(
        max_df=0.95,
        min_df=2,
        # max_features=n_features,
        stop_words="english",
    )
    # TfidfVectorizer already performs counting *and* TF-IDF weighting.  The
    # original fed this result into a second TfidfTransformer, which applied
    # the IDF weighting twice; a single pass is the intended computation.
    tfidf = vectorizer.fit_transform(corpus)

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; support whichever one is available.
    get_names = (
        getattr(vectorizer, "get_feature_names_out", None)
        or vectorizer.get_feature_names
    )
    return tfidf, list(get_names())


def write_weights(path, weights):
    """Write the non-zero TF-IDF weights of every document to *path*.

    Each output line is ``<doc number><TAB><word number>:<weight> ...`` with
    both numbers one-based.  The zero-based document index is printed after
    each line is written, as a progress indicator.
    """
    # NOTE(review): word numbers here are one-based, while dictionary.txt
    # numbers the same words from zero -- subtract 1 before looking a word up.
    # Left unchanged so existing consumers of either file keep working.
    with open(path, "w", encoding="utf-8") as out:
        for doc_index, row in enumerate(weights):
            entries = "".join(
                str(word_index + 1) + ":" + str(value) + " "
                for word_index, value in enumerate(row)
                if value > 0
            )
            out.write(str(doc_index + 1) + "\t" + entries + "\n")
            print(doc_index)


def write_dictionary(path, words):
    """Write the vocabulary to *path* as ``<zero-based index><TAB><word>`` lines."""
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            str(index) + "\t" + word + "\n" for index, word in enumerate(words)
        )


def main(base_dir=DEFAULT_DIR):
    """Run the whole pipeline on the ``docs.txt`` found in *base_dir*."""
    # Different documents are separated by newlines.
    corpus = load_corpus(os.path.join(base_dir, "docs.txt"))
    tfidf, words = build_tfidf(corpus)

    write_weights(os.path.join(base_dir, "tif.txt"), tfidf.toarray())
    write_dictionary(os.path.join(base_dir, "dictionary.txt"), words)

    # TfidfVectorizer L2-normalises every row by default, so the product of
    # the matrix with its transpose holds the cosine similarity of each pair
    # of documents.  "@" and toarray() replace the sparse "*" / ".A" spelling,
    # which newer SciPy releases no longer provide.
    sim_matrix = (tfidf @ tfidf.T).toarray()

    # Sample output: similarity of the 2nd and 4th documents (zero-based
    # indices 1 and 3).  Skipped for small corpora so the matrix is still saved.
    if sim_matrix.shape[0] > 3:
        print(sim_matrix[1, 3])

    # Save the similarity matrix.
    numpy.savetxt(
        os.path.join(base_dir, "simmatrix.csv"), sim_matrix, delimiter=","
    )


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_DIR)
Computing text similarity with scikit-learn (sklearn)