pre_file.py
# -*- coding: utf-8 -*-
# pre_file.py: segment crawled page text with jieba and dump it to a file.
#
# Reads the `link` and `content` columns of every row in test1.spider, cuts
# each `content` into words with jieba, and writes one line per row in the
# form "<link> <word>/<word>/...\r\n" to OUTPUT_PATH, encoded as UTF-8.
import MySQLdb
import MySQLdb as mdb
import os
import sys
import string
import jieba
import codecs
from contextlib import closing

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8 instead
    # of ASCII. Neither call exists (or is needed) on Python 3.
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Raw string so the backslashes of the Windows path are never read as escapes.
OUTPUT_PATH = r'C:\Users\kk\desktop\hello-result1.txt'

# Connect to the database.
try:
    conn = mdb.connect(host='127.0.0.1', user='root', passwd='Kongjunli',
                       db='test1', charset='utf8')
except mdb.Error as e:
    # sys.exit(<str>) prints to stderr and exits with a non-zero status, so a
    # failed connection is no longer reported as success.
    sys.exit('Could not connect to MySQL: %s' % e)

# Fetch the content. closing() releases the cursor and the connection even if
# the query raises. DictCursor returns each row as a dict keyed by column name.
# This script only reads, so no commit is needed (a commit is only required
# after inserting or modifying data).
sql = 'SELECT link,content from test1.spider;'
with closing(conn), closing(conn.cursor(mdb.cursors.DictCursor)) as cursor:
    cursor.execute(sql)
    data = cursor.fetchall()  # all result rows

# codecs.open() always opens the underlying file in binary mode, so the
# explicit '\r\n' below is written as-is and not translated a second time.
with codecs.open(OUTPUT_PATH, 'w', 'utf-8') as f:
    for row in data:
        # cut_all must be the boolean False (accurate mode); the string
        # 'False' is truthy and would silently switch jieba to full mode.
        seg = '/'.join(jieba.cut(row['content'], cut_all=False))
        f.write(row['link'] + ' ' + seg + '\r\n')
jiansuo.py
# -*- coding: utf-8 -*-
# jiansuo.py: rank the stored documents against a query by TF-IDF similarity.
#
# Loads every row of Cutresult_copy, treats the second column as a
# '/'-separated token list, builds a gensim dictionary and TF-IDF model from
# those documents, prompts for a query, and prints the similarity scores
# between the query and every document in descending order.
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora, models, similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs
from contextlib import closing

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8 instead
    # of ASCII. Neither call exists (or is needed) on Python 3.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    read_query = raw_input
else:
    read_query = input

# closing() is used instead of `with con:` because the connection's own
# context-manager behaviour differs between MySQLdb releases; this form always
# releases both the cursor and the connection.
# NOTE(review): table name kept exactly as found; confirm its real casing if
# the server uses case-sensitive table names.
with closing(mdb.connect(host='127.0.0.1', user='root', passwd='Kongjunli',
                         db='test1', charset='utf8')) as con:
    with closing(con.cursor()) as cur:
        cur.execute('SELECT * from Cutresult_copy')
        rows = cur.fetchall()

if not rows:
    # Nothing to index; building the model and index on no documents would
    # only fail later with a less helpful error.
    sys.exit('Cutresult_copy returned no rows; nothing to search.')


class MyCorpus(object):
    """Iterable over the documents held in the module-level ``rows``.

    Each iteration yields one document as a list of tokens, obtained by
    splitting the second column of the row on '/'.
    """

    def __iter__(self):
        for row in rows:
            yield str(row[1]).split('/')


# Enable logging.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                    level=logging.INFO)

Corp = MyCorpus()

# Convert the page documents to TF-IDF.
dictionary = corpora.Dictionary(Corp)
corpus = [dictionary.doc2bow(text) for text in Corp]  # bag-of-words vectors
tfidf = models.TfidfModel(corpus)  # fit the TF-IDF model on the documents
corpus_tfidf = tfidf[corpus]  # TF-IDF weights of every document

# Alternative input (disabled): read the query from a file instead of
# prompting for it.
#     with open(r'C:\Users\kk\Desktop\q.txt', 'r') as q_file:
#         query = q_file.readline()
#     vec_bow = dictionary.doc2bow(query.split(' '))

# The query is split on whitespace only, so it has to be entered as
# space-separated tokens.
query = read_query('Enter your query:')
vec_bow = dictionary.doc2bow(query.split())  # query as a bag-of-words vector
vec_tfidf = tfidf[vec_bow]  # TF-IDF weights of the query

# num_features is given explicitly so the index width always matches the
# dictionary and gensim does not need an extra pass to infer it.
index = similarities.MatrixSimilarity(corpus_tfidf,
                                      num_features=len(dictionary))
sims = index[vec_tfidf]
similarity = list(sims)
print(sorted(similarity, reverse=True))
Encodings.xml
<?xml version="1.0" encoding="UTF-8"?>
Misc.xml
<?xml version="1.0" encoding="UTF-8"?>
Modules.xml
<?xml version="1.0" encoding="UTF-8"?>