Using Gensim in Python to calculate document similarity

Source: Internet
Author: User
Tags: file, url, idf
pre_file.py

# -*- coding: utf-8 -*-
"""Segment crawled page content with jieba and dump it to a text file.

Reads every (link, content) row from the ``test1.spider`` MySQL table, cuts
the content into words with jieba, and writes one line per page in the form
``<link> <word>/<word>/...`` to OUTPUT_PATH.
"""
import MySQLdb
import MySQLdb as mdb
import MySQLdb.cursors
import os
import sys
import string
import jieba
import codecs

# Python 2 only: make implicit str/unicode conversions use UTF-8 instead of
# ASCII. Python 3 has no reload() builtin and does not need this.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

# Raw string so the backslashes in the Windows path are not read as escapes.
OUTPUT_PATH = r'C:\Users\kk\Desktop\hello-result1.txt'

# Connect to the database.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment instead.
try:
    conn = mdb.connect(host='127.0.0.1', user='root', passwd='Kongjunli',
                       db='test1', charset='utf8')
except mdb.Error as e:
    print(e)
    sys.exit(1)  # non-zero status so a caller can detect the failure

try:
    # DictCursor returns each row as a dict keyed by column name.
    cursor = conn.cursor(mdb.cursors.DictCursor)
    try:
        # execute() takes the SQL statement as a string.
        cursor.execute('SELECT link,content FROM test1.spider;')
        # fetchall() returns every row of the result set.
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # No commit is needed: this script only reads from the database.
    conn.close()

with codecs.open(OUTPUT_PATH, 'w', 'utf-8') as f:
    for row in data:
        # cut_all must be the boolean False (accurate mode); the string
        # 'False' is truthy and would silently select full mode.
        seg = '/'.join(jieba.cut(row['content'], cut_all=False))
        f.write(row['link'] + ' ' + seg + '\r\n')

jiansuo.py

# -*- coding: utf-8 -*-
"""Rank stored documents by TF-IDF cosine similarity to a typed-in query.

Loads the pre-segmented documents from the ``cutresult_copy`` MySQL table,
builds a gensim dictionary, bag-of-words corpus and TF-IDF model from them,
then prompts for a whitespace-separated query and prints the similarity
scores in descending order.
"""
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora, models, similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs

# Python 2 only: make str(unicode_value) below encode as UTF-8 instead of
# ASCII. Python 3 has no reload() builtin and does not need this.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment instead.
con = mdb.connect(host='127.0.0.1', user='root', passwd='Kongjunli',
                  db='test1', charset='utf8')
# Explicit try/finally rather than "with con:", whose meaning differs
# between MySQLdb/mysqlclient releases.
try:
    cur = con.cursor()
    try:
        cur.execute('SELECT * FROM cutresult_copy')
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    con.close()


class MyCorpus(object):
    """Iterable over the segmented documents held in ``rows``.

    Each iteration yields one document as a list of tokens, obtained by
    splitting the second column of a row on '/'.
    """

    def __iter__(self):
        for row in rows:
            yield str(row[1]).split('/')


# Turn on logging so gensim reports its progress.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                    level=logging.INFO)

corp = MyCorpus()

# Map every token to an integer id, then convert each document to
# bag-of-words form.
dictionary = corpora.Dictionary(corp)
corpus = [dictionary.doc2bow(text) for text in corp]

# Fit the TF-IDF model on the corpus and apply it to every document.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Alternative: read the query from a file instead of prompting, e.g.
#     with open(r'C:\Users\kk\Desktop\q.txt') as q_file:
#         query = q_file.readline()
#     vec_bow = dictionary.doc2bow(query.split(' '))

# The query must already be tokenised: tokens are split on whitespace.
query = read_line('Enter your query: ')
vec_bow = dictionary.doc2bow(query.split())
vec_tfidf = tfidf[vec_bow]

# Pass num_features explicitly so the index is as wide as the dictionary;
# otherwise gensim infers the width from the ids present in corpus_tfidf.
index = similarities.MatrixSimilarity(corpus_tfidf,
                                      num_features=len(dictionary))
sims = index[vec_tfidf]
similarity = list(sims)
print(sorted(similarity, reverse=True))

encodings.xml

<?xml version= "1.0" encoding= "UTF-8"?>
 
   
  
     
   
     
  
   
 
  

misc.xml

<?xml version= "1.0" encoding= "UTF-8"?>
 
   
  
     
   
      
   
      
   
      
   
      
   
      
   
      
   
      
   
     
  
    
  
   
 
  

modules.xml

<?xml version= "1.0" encoding= "UTF-8"?>
 
   
  
     
   
       
    
       
   
     
  
   
 
  
  • Related Article

    Contact Us

    The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 technical support, 6 free tickets per quarter, faster response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.