Using gensim in Python to calculate document similarity
Pre_file.py
# -*- coding: utf-8 -*-
"""Segment crawled page content with jieba and dump it to a text file.

Reads the ``link`` and ``content`` columns of ``test1.spider`` from MySQL,
cuts each ``content`` value into words with jieba, and writes one line per
row to ``OUTPUT_PATH``: the link, a space, then the words joined by '/'.

This is a Python 2 script (it relies on ``reload(sys)`` and
``sys.setdefaultencoding``).
"""
import MySQLdb
import MySQLdb as mdb
import os
import sys
import string
import jieba
import codecs

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Raw string so the backslashes of the Windows path are taken literally.
OUTPUT_PATH = r'C:\Users\kk\Desktop\hello-result1.txt'

# Connect to the database.
try:
    conn = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunlil',
                       db='test1', charset='utf8')
except mdb.Error as e:
    print(e)
    # Exit with a non-zero status so callers can tell the run failed.
    sys.exit(1)

try:
    # DictCursor returns each row as a dict keyed by column name.
    cursor = conn.cursor(mdb.cursors.DictCursor)
    try:
        # Fetch the link and content of every crawled page.
        cursor.execute('SELECT link, content FROM test1.spider;')
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # Only a SELECT is issued, so there is nothing to commit before closing.
    conn.close()

with codecs.open(OUTPUT_PATH, 'w', 'utf-8') as f:
    for row in data:
        # cut_all must be the boolean False: the string 'false' is truthy
        # and would silently switch jieba to full mode.
        seg = '/'.join(jieba.cut(row['content'], cut_all=False))
        f.write(row['link'] + ' ' + seg + '\r\n')
Jiansuo.py
# -*- coding: utf-8 -*-
"""Rank stored documents by tf-idf similarity to an interactive query.

Loads every row of the ``cutresult_copy`` table, treats column 1 of each row
as a '/'-separated list of words, builds a gensim dictionary, bag-of-words
corpus and tf-idf model from them, then reads a query from standard input
and prints the similarity scores of all documents in descending order.

This is a Python 2 script (it relies on ``reload(sys)``,
``sys.setdefaultencoding`` and ``raw_input``).
"""
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora, models, similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

con = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunlil',
                  db='test1', charset='utf8')
try:
    cur = con.cursor()
    try:
        cur.execute('SELECT * FROM cutresult_copy')
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    # Explicit close instead of ``with con:``, whose meaning differs
    # between MySQLdb releases.
    con.close()


class MyCorpus(object):
    """Iterable that yields the word list of each fetched row.

    Reads the module-level ``rows``; column 1 of every row is split on '/'.
    """

    def __iter__(self):
        for row in rows:
            yield str(row[1]).split('/')


# Enable logging so gensim reports its progress.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

Corp = MyCorpus()

# Map every distinct word to an integer id.
dictionary = corpora.Dictionary(Corp)

# Convert each document into a bag-of-words vector.
corpus = [dictionary.doc2bow(text) for text in Corp]

# Fit the tf-idf model on the corpus, then weight every document with it.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Alternative, disabled: read the query from a file instead of the console.
# q_file = open(r'C:\Users\kk\Desktop\q.txt', 'r')
# query = q_file.readline()
# q_file.close()
# vec_bow = dictionary.doc2bow(query.split(' '))  # query -> bag of words
# vec_tfidf = tfidf[vec_bow]                      # tf-idf of the query

query = raw_input('Enter your query: ')
vec_bow = dictionary.doc2bow(query.split())
vec_tfidf = tfidf[vec_bow]

# Pass num_features explicitly: inferring it from the tf-idf corpus can yield
# fewer features than the dictionary holds, and a query containing a
# higher-numbered term id would then fall outside the index.
index = similarities.MatrixSimilarity(corpus_tfidf,
                                      num_features=len(dictionary))
sims = index[vec_tfidf]

similarity = list(sims)
print(sorted(similarity, reverse=True))
Encodings.xml
<?xml version="1.0" encoding="UTF-8"?><project version="4"> <component name="Encoding"> <file url="PROJECT" charset="UTF-8" /> </component></project>
Misc.xml
<?xml version="1.0" encoding="UTF-8"?><project version="4"> <component name="ProjectLevelVcsManager" settingsEditedManually="false"> <OptionsSetting value="true" id="Add" /> <OptionsSetting value="true" id="Remove" /> <OptionsSetting value="true" id="Checkout" /> <OptionsSetting value="true" id="Update" /> <OptionsSetting value="true" id="Status" /> <OptionsSetting value="true" id="Edit" /> <ConfirmationsSetting value="0" id="Add" /> <ConfirmationsSetting value="0" id="Remove" /> </component> <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.11 (C:\Python27\python.exe)" project-jdk-type="Python SDK" /></project>
Modules.xml
<?xml version="1.0" encoding="UTF-8"?><project version="4"> <component name="ProjectModuleManager"> <modules> <module fileurl="file://$PROJECT_DIR$/.idea/crawler exercise code.iml" filepath="$PROJECT_DIR$/.idea/crawler exercise code.iml" /> </modules> </component></project>
Articles you may be interested in:
- How to find similar words in Python
- Implementation of function calling methods with similar structures in Python
- How Python compares similarity between two images
- Using Python to implement simple similar image search tutorials
- Recognition of similar images using python