Gensim Trial Summary

Source: Internet
Author: User
# Gensim walkthrough: segment a tiny corpus with LTP, build a gensim
# Dictionary and bag-of-words vectors, then rank each document's words by
# their TF-IDF weight.
#
# NOTE(review): the sample sentences and tokens in this listing are English
# renderings, while the model loaded below is an LTP word-segmentation model;
# presumably the original literals were Chinese -- TODO confirm and restore
# them before relying on the printed results.
from operator import itemgetter

import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from pyltp import Segmentor

# Files read and written by the steps below.
CWS_MODEL_PATH = '/usr/local/ltp_data/cws.model'
DICT_PATH = 'test.dict'
CORPUS_VECTOR_PATH = 'corpus_vector.mm'
TFIDF_MODEL_PATH = 'DATA.TFIDF'

corpus = [
    'The situation is changing subtly',
    'to the person who needs the most money',
    'to the best person',
    'to the person who needs it most',
]

# Filled by segment(): one list of words per sentence in ``corpus``.
doc_list = []


def segment() -> None:
    """Split every sentence in ``corpus`` into words and print the result.

    The word lists are appended to the module-level ``doc_list`` so that
    ``test_dictionary`` can build a dictionary from them.
    """
    segmentor = Segmentor()
    segmentor.load(CWS_MODEL_PATH)
    try:
        # list(...) copies the words out before the model is released.
        doc_list.extend(list(segmentor.segment(doc)) for doc in corpus)
    finally:
        # Free the loaded model even if segmentation fails.
        segmentor.release()

    for words in doc_list:
        print(words)


def test_dictionary() -> None:
    """Build a dictionary from ``doc_list`` and save bag-of-words vectors.

    Writes the dictionary to ``DICT_PATH`` and the per-document
    ``(word id, count)`` vectors to ``CORPUS_VECTOR_PATH``; ``TF_IDF`` loads
    both files. Intermediate results are printed along the way.
    """
    dictionary = Dictionary(doc_list)  # input: already-segmented documents
    dictionary.save(DICT_PATH)
    # dictionary = Dictionary.load(DICT_PATH)
    print(dictionary)
    print(dictionary.token2id)

    # Look a word up by its id.
    # NOTE(review): the id argument is missing in the original listing; 12 is
    # the id its sample token2id output gives to the word its sample output
    # prints -- TODO confirm.
    print(dictionary.get(12))

    # A new, already-segmented document. Words that are not in the
    # dictionary get no entry in the resulting vector.
    # NOTE(review): the sixth token is blank in the original listing.
    new_split_doc = ['I', 'is', 'one', 'excellent', 'excellent', '', 'person']
    the_vector_of_new_split_doc = dictionary.doc2bow(new_split_doc)
    print(the_vector_of_new_split_doc)

    # Vectorise the documents the dictionary was built from.
    corpus_vector = [dictionary.doc2bow(doc) for doc in doc_list]
    for doc_vec in corpus_vector:
        print(doc_vec)  # list of (word id, word frequency)

    corpora.MmCorpus.serialize(CORPUS_VECTOR_PATH, corpus_vector)


def TF_IDF(topk: int = 5) -> None:
    """Train a TF-IDF model on the saved vectors and print top words.

    Loads the files written by ``test_dictionary``, trains and prints the
    model, saves it to ``TFIDF_MODEL_PATH``, reloads it, and prints for each
    document its highest-weighted words.

    Args:
        topk: Maximum number of words printed per document.
    """
    bow_corpus = corpora.MmCorpus(CORPUS_VECTOR_PATH)
    dictionary = Dictionary.load(DICT_PATH)

    tfidf_model = models.TfidfModel(bow_corpus, id2word=dictionary)
    print(tfidf_model)
    print(tfidf_model.dfs)

    doc_set_tfidf = tfidf_model[bow_corpus]  # apply the model to the corpus
    print(doc_set_tfidf)
    for doc_tfidf in doc_set_tfidf:
        print(doc_tfidf)

    print()
    tfidf_model.save(TFIDF_MODEL_PATH)
    new_tfidf_model = models.TfidfModel.load(TFIDF_MODEL_PATH)
    # Iterate the reloaded model's output so the save/load round trip is
    # actually exercised (the listing computed it but looped over the
    # first model's output instead).
    doc_set_tfidf2 = new_tfidf_model[bow_corpus]

    limit = max(topk, 0)
    for doc_tfidf in doc_set_tfidf2:
        # Highest weight first; ties are broken by the larger word id.
        ranked = sorted(doc_tfidf, key=itemgetter(1, 0), reverse=True)
        keys = [dictionary.get(word_id) for word_id, _ in ranked[:limit]]
        # NOTE(review): the separator literal is garbled in the original
        # listing; a space matches the sample output shown there.
        print(' '.join(keys))


if __name__ == '__main__':
    segment()
    print()
    test_dictionary()
    print()
    TF_IDF()

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If anything on this page is confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.