In text processing (for example, when mining product reviews) you sometimes need to know how similar each review is to the description of the item, as a way of measuring how objective the review is. Is there a Python library for computing text similarity? Yes, there is, and it is a very powerful one: gensim. Let's try it out.
pre_file.py
# -*- coding: utf-8 -*-
"""pre_file.py: segment crawled documents with jieba and dump them to a file.

Reads the ``link`` and ``content`` columns of ``test1.spider`` from MySQL,
cuts each ``content`` into words with jieba, and writes one line per row to a
UTF-8 text file in the form ``<link> <word>/<word>/...``.

This is a Python 2 script (it relies on ``sys.setdefaultencoding``).
"""
import MySQLdb
import MySQLdb as mdb
import os
import sys
import string
import jieba
import codecs

# Python 2 only: make implicit str <-> unicode conversions use UTF-8 instead
# of ASCII, so non-ASCII text does not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

# Raw string so the backslashes of the Windows path are kept literally.
OUTPUT_PATH = r'C:\Users\kk\Desktop\hello-result1.txt'

# Connect to the database.
try:
    conn = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunlil',
                       db='test1', charset='utf8')
except mdb.Error as e:
    print(e)
    # Non-zero exit status so callers can tell the connection failed.
    sys.exit(1)

# DictCursor returns each row as a dict keyed by column name.
cursor = conn.cursor(mdb.cursors.DictCursor)
try:
    # Fetch the documents to segment.
    sql = 'SELECT link, content FROM test1.spider'
    cursor.execute(sql)
    data = cursor.fetchall()  # all result rows

    with codecs.open(OUTPUT_PATH, 'w', 'utf-8') as f:
        for row in data:
            # cut_all must be the boolean False (accurate mode); a non-empty
            # string such as 'false' is truthy and would select full mode.
            seg = '/'.join(jieba.cut(row['content'], cut_all=False))
            f.write(row['link'] + ' ' + seg + '\r\n')
finally:
    # Release the database resources even if the query or the write fails.
    # No commit is needed here: the script only reads from the database
    # (a commit is only required when inserting data).
    cursor.close()
    conn.close()
jiansuo.py
# -*- coding: utf-8 -*-
"""jiansuo.py: rank the stored documents by tf-idf similarity to a query.

Loads the pre-segmented documents from the ``cutresult_copy`` table, builds a
bag-of-words corpus and a tf-idf model with gensim, reads a query from the
console and prints the similarity scores in descending order.

This is a Python 2 script (``reload(sys)``, ``raw_input``).
"""
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora, models, similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs

# Python 2 only: make implicit str <-> unicode conversions use UTF-8 instead
# of ASCII, so str(row[1]) below works on non-ASCII text.
reload(sys)
sys.setdefaultencoding('utf-8')

con = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunlil',
                  db='test1', charset='utf8')
# Close the connection explicitly: what ``with con:`` does differs between
# MySQLdb / mysqlclient versions.
try:
    cur = con.cursor()
    cur.execute('SELECT * FROM cutresult_copy')
    rows = cur.fetchall()
finally:
    con.close()


class MyCorpus(object):
    """Iterable over the documents in ``rows``, one token list per document."""

    def __iter__(self):
        for row in rows:
            # Column 1 holds the text with tokens joined by '/'.
            yield str(row[1]).split('/')


# Enable logging.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# NOTE(review): the statements from the end of the logging format string to
# the start of the doc2bow list comprehension were lost in the source text;
# they are rebuilt here from the names the rest of the script uses
# (``Corp``, ``dictionary``, ``corpus``) -- confirm against the original.
Corp = MyCorpus()
dictionary = corpora.Dictionary(Corp)

# Convert every document into its bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in Corp]

# Fit the tf-idf model on the corpus, then weight the corpus with it.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Alternative (disabled): read the query from a file instead of the console.
# q_file = open(r'C:\Users\kk\Desktop\q.txt', 'r')
# query = q_file.readline()
# q_file.close()
# vec_bow = dictionary.doc2bow(query.split(' '))
# vec_tfidf = tfidf[vec_bow]

query = raw_input('enter your query: ')
# The query must be mapped into the same bag-of-words / tf-idf space as the
# documents before it can be compared with them.
vec_bow = dictionary.doc2bow(query.split())
vec_tfidf = tfidf[vec_bow]

index = similarities.MatrixSimilarity(corpus_tfidf)
sims = index[vec_tfidf]  # one similarity score per document
similarity = list(sims)
print(sorted(similarity, reverse=True))
encodings.xml
<?xml version="1.0" encoding="UTF-8"?>
misc.xml
<?xml version="1.0" encoding="UTF-8"?>
modules.xml
<?xml version="1.0" encoding="UTF-8"?>