"""Demo: build a gensim Dictionary and TF-IDF model from LTP-segmented text.

Pipeline: segment raw sentences with pyltp's ``Segmentor``, build a gensim
``Dictionary`` and bag-of-words corpus from the token lists, then train,
save, reload and apply a ``TfidfModel``, printing the top-k highest-weighted
words of each document.
"""

from gensim import corpora, models
from gensim.corpora import Dictionary
from pyltp import Segmentor

# Raw demo sentences (translated from the original Chinese examples).
CORPUS = [
    'The situation is changing subtly',
    'to the person who needs the most money',
    'to the best person',
    'to the person who needs it most',
]

# Filled by segment(): one token list per sentence in CORPUS.
doc_list = []


def segment():
    """Tokenize every sentence in CORPUS with the LTP word segmenter.

    Appends each resulting token list to the module-level ``doc_list`` and
    prints it.  Requires the LTP ``cws.model`` file on disk at the path
    below -- adjust for your installation.
    """
    segmentor = Segmentor()
    segmentor.load('/usr/local/ltp_data/cws.model')
    try:
        for doc in CORPUS:
            words = list(segmentor.segment(doc))
            doc_list.append(words)
    finally:
        # Free the native model resources held by pyltp.
        segmentor.release()
    for words in doc_list:
        print(words)


def test_dictionary():
    """Build a Dictionary from doc_list, save it, and demo BOW vectors."""
    dictionary = Dictionary(doc_list)    # word table built from segmented docs
    dictionary.save('test.dict')         # persist; reload via Dictionary.load('test.dict')
    print(dictionary)                    # e.g. Dictionary(14 unique tokens: [...])
    print(dictionary.token2id)           # mapping: token -> integer id
    print(dictionary.get(12))            # reverse lookup: id -> token

    # Vectorize a NEW, already-segmented document.  Tokens absent from the
    # dictionary are silently dropped from the resulting bag-of-words vector.
    new_split_doc = ['I', 'is', 'one', 'excellent', 'excellent', 'person']
    new_doc_vector = dictionary.doc2bow(new_split_doc)
    print(new_doc_vector)

    # Vectorize the training documents themselves: (word id, term frequency).
    corpus_vector = [dictionary.doc2bow(doc) for doc in doc_list]
    for doc_vec in corpus_vector:
        print(doc_vec)

    # Persist the BOW corpus in Matrix Market format for later reuse;
    # reload with corpora.MmCorpus('corpus_vector.mm').
    corpora.MmCorpus.serialize('corpus_vector.mm', corpus_vector)


def TF_IDF():
    """Train, persist, reload and apply a TF-IDF model over the saved corpus.

    Prints each document's TF-IDF vector, then the top-k highest-weighted
    words per document (joined with '.').
    """
    corpus = corpora.MmCorpus('corpus_vector.mm')   # load the saved BOW corpus
    dictionary = Dictionary.load('test.dict')       # load the saved word table
    tfidf_model = models.TfidfModel(corpus, id2word=dictionary)
    print(tfidf_model)        # e.g. TfidfModel(num_docs=4, num_nnz=24)
    print(tfidf_model.dfs)    # document frequency per token id

    doc_set_tfidf = tfidf_model[corpus]   # lazily transformed corpus
    print(doc_set_tfidf)                  # <gensim.interfaces.TransformedCorpus ...>
    for doc_tfidf in doc_set_tfidf:
        print(doc_tfidf)                  # list of (token id, tf-idf weight)

    tfidf_model.save('data.tfidf')                        # persist the model
    new_tfidf_model = models.TfidfModel.load('data.tfidf')
    doc_set_tfidf2 = new_tfidf_model[corpus]

    topk = 5
    # NOTE: the original iterated doc_set_tfidf here, never exercising the
    # reloaded model; iterate doc_set_tfidf2 so the round-trip is verified.
    for doc_tfidf in doc_set_tfidf2:
        # Rank terms by weight (ties broken by id), descending; keep <= topk.
        ranked = sorted(doc_tfidf, reverse=True, key=lambda x: (x[1], x[0]))
        keys = [dictionary.get(term_id) for term_id, _weight in ranked[:topk]]
        print('.'.join(keys))


if __name__ == '__main__':
    segment()
    print()
    test_dictionary()
    print()
    TF_IDF()