Step 1: Build a corpus:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
from gensim import corpora, models, similarities

# Data source directory (two-level directory: data/<sub-directory>/<novel file>)
sourceDataDir = 'data'

# List of data source files
fileLists = []

def getSourceFileLists(sourceDataDir):
    """Walk the two-level directory and collect every file path."""
    fileLists = []
    subDirList = os.listdir(sourceDataDir)
    for subDir in subDirList:
        subList = os.listdir(sourceDataDir + '/' + subDir)
        fileList = [sourceDataDir + '/' + subDir + '/' + x for x in subList
                    if os.path.isfile(sourceDataDir + '/' + subDir + '/' + x)]
        fileLists += fileList
    return fileLists

fileLists = getSourceFileLists(sourceDataDir)

if 0 < len(fileLists):
    import codecs
    import jieba

    punctuations = ['', '\n', '\t', ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

    if not os.path.exists('dict'):
        os.mkdir('dict')
    if not os.path.exists('corpus'):
        os.mkdir('corpus')

    for fileName in fileLists:
        print fileName

        hFile = None
        content = None
        try:
            # The novels are GB18030-encoded text files
            hFile = codecs.open(fileName, 'r', 'gb18030')
            content = hFile.readlines()
        except Exception, e:
            print e
        finally:
            if hFile:
                hFile.close()

        if content:
            # Full-mode word segmentation, then drop punctuation
            fileFenci = [x for x in jieba.cut(''.join(content), cut_all=True)]
            fileFenci2 = [word for word in fileFenci if word not in punctuations]

            texts = [fileFenci2]

            # Remove tokens that appear only once
            all_tokens = sum(texts, [])
            tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
            texts = [[word for word in text if word not in tokens_once] for text in texts]

            sFileDir, sFileName = os.path.split(fileName)
            dictFileName = 'dict/' + sFileName + '.dict'
            corpusFileName = 'corpus/' + sFileName + '.mm'

            # One dictionary and one MatrixMarket corpus per novel
            dictionary = corpora.Dictionary(texts)
            dictionary.save_as_text(dictFileName)

            corpus = [dictionary.doc2bow(text) for text in texts]
            corpora.MmCorpus.serialize(corpusFileName, corpus)

print 'Build corpus done'
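The segmentation above uses jieba's full mode (cut_all=True), which emits every plausible word rather than a single best segmentation. A minimal sketch of the difference; the sample sentence is only an illustration and is not taken from the data set:

# -*- coding: utf-8 -*-
import jieba

sentence = u'我来到北京清华大学'

# Full mode, as used in the corpus builder: every possible word is emitted,
# which trades precision for recall.
print u'/'.join(jieba.cut(sentence, cut_all=True))

# Accurate mode (the default): one best segmentation.
print u'/'.join(jieba.cut(sentence, cut_all=False))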
Data source:
83 novels from http://d1.txthj.com/newrar/txthj_264.rar, stored in the directory ./data.
They are read as a two-level directory (./data/<sub-directory>/<novel file>) during loading.
Output:
./dict and ./corpus
XXX.dict and XXX.mm are generated in the corresponding directories, where XXX is the full file name of the original novel (no path, suffix included).
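To sanity-check Step 1, the generated files can be loaded back with gensim. A minimal sketch; 'XXX' is the same placeholder as above and stands for the full file name of any processed novel:

from gensim import corpora

# Text-format dictionary and MatrixMarket corpus written by Step 1
dictionary = corpora.Dictionary.load_from_text('dict/XXX.dict')
corpus = corpora.MmCorpus('corpus/XXX.mm')

print dictionary           # e.g. Dictionary(... unique tokens)
print corpus               # e.g. MmCorpus(1 documents, ... features, ... non-zero entries)
print list(corpus)[0][:5]  # first few (token_id, count) pairs of the single document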
Step 2: Load the corpus and perform similarity analysis
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
from gensim import corpora, models, similarities

def getFileList(dir):
    return [dir + x for x in os.listdir(dir)]

dictLists = getFileList('./dict/')

class LoadDictionary(object):
    """Stream the per-novel dictionaries one at a time."""
    def __init__(self, dictionary):
        self.dictionary = dictionary

    def __iter__(self):
        for dictFile in dictLists:
            sFileRaw, sFilePostfix = os.path.splitext(dictFile)
            sFileDir, sFileName = os.path.split(sFileRaw)
            (dictFile, corpusFile) = ('./dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm')
            yield self.dictionary.load_from_text(dictFile)

class LoadCorpus(object):
    """Stream the per-novel MatrixMarket corpora one at a time."""
    def __iter__(self):
        for dictFile in dictLists:
            sFileRaw, sFilePostfix = os.path.splitext(dictFile)
            sFileDir, sFileName = os.path.split(sFileRaw)
            (dictFile, corpusFile) = ('./dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm')
            yield corpora.MmCorpus(corpusFile)

"""
    Preprocessing (easy_install nltk)
"""
# Simplified Chinese + English preprocessing
def pre_process_cn(inputs, low_freq_filter=True):
    """
        1. Remove stop words
        2. Remove punctuation
        3. Stem
        4. Remove low-frequency words
    """
    import nltk
    import jieba.analyse
    from nltk.tokenize import word_tokenize

    texts_tokenized = []
    for document in inputs:
        texts_tokenized_tmp = []
        for word in word_tokenize(document):
            texts_tokenized_tmp += jieba.analyse.extract_tags(word, 10)
        texts_tokenized.append(texts_tokenized_tmp)

    texts_filtered_stopwords = texts_tokenized

    # Remove punctuation
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    texts_filtered = [[word for word in document if word not in english_punctuations]
                      for document in texts_filtered_stopwords]

    # Stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]

    # Remove low-frequency words (words that appear only once)
    if low_freq_filter:
        all_stems = sum(texts_stemmed, [])
        stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
        texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
    else:
        texts = texts_stemmed
    return texts

dictionary = corpora.dictionary.Dictionary()
dictionary_memory_friendly = LoadDictionary(dictionary)
for vector in dictionary_memory_friendly:
    dictionary = vector

corpus = []
corpus_memory_friendly = LoadCorpus()
for vector in corpus_memory_friendly:
    corpus.append(vector[0])

if 0 < len(corpus):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # When id2word=dictionary is not given, LsiModel rebuilds a dictionary from the corpus internally
    model = models.LsiModel(corpus_tfidf, id2word=None, num_topics=20, chunksize=2000000)
    index = similarities.Similarity('./novel_', model[corpus], num_features=len(corpus))

# The target to be matched: a passage picked at random from one of the novels
# (the original post uses the Chinese text; an English rendering is shown here)
target_courses = ["The men's faces were heavy and solemn, while the veiled women sobbed intermittently; "
                  "they stared intently ahead, witnessing a battle of life and death."]
target_text = pre_process_cn(target_courses, low_freq_filter=False)

"""
    Match similarity against the chosen target
"""
# Pick the benchmark document
ml_course = target_text[0]
# Bag-of-words representation
ml_bow = dictionary.doc2bow(ml_course)

# Compute the similarity between the target and the other documents in the LSI model selected above
ml_lsi = model[ml_bow]  # ml_lsi has the form (topic_id, topic_value)
sims = index[ml_lsi]    # sims is the final result; index[xxx] calls the built-in __getitem__() on ml_lsi

# Sort by descending similarity
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Inspect the results
print sort_sims[0:10]
print len(dictLists)
print dictLists[sort_sims[1][0]]
print dictLists[sort_sims[2][0]]
print dictLists[sort_sims[3][0]]
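For reference, here is the same TF-IDF → LSI → similarity chain reduced to a self-contained toy with three tiny in-memory documents. The tokens, num_topics=2 and the query are illustrative only, and unlike the script above the TF-IDF transform is applied consistently to both the indexed documents and the query:

from gensim import corpora, models, similarities

texts = [['human', 'machine', 'interface'],
         ['graph', 'trees', 'minors'],
         ['human', 'system', 'interface']]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)
lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)

# MatrixSimilarity keeps the whole index in memory, which is fine for a toy corpus;
# Similarity (used above) shards the index to disk for large corpora.
index = similarities.MatrixSimilarity(lsi[tfidf[corpus]], num_features=2)

query_bow = dictionary.doc2bow(['human', 'interface'])
query_lsi = lsi[tfidf[query_bow]]

# (document_id, cosine similarity) pairs, best match first
print sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])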
Note:
yield is used for better memory efficiency: the dictionaries and corpora are loaded one at a time instead of all at once.
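A stripped-down sketch of the same streaming pattern (the class name StreamedCorpora and the './corpus/' default are illustrative): because __iter__ yields one MmCorpus at a time, only a single corpus is held in memory during the loop, whereas an equivalent list comprehension would load every .mm file before the first one could be processed.

import os
from gensim import corpora

class StreamedCorpora(object):
    def __init__(self, corpus_dir='./corpus/'):
        self.corpus_dir = corpus_dir

    def __iter__(self):
        # Yield one corpus per .mm file; nothing is kept after the caller moves on
        for name in os.listdir(self.corpus_dir):
            if name.endswith('.mm'):
                yield corpora.MmCorpus(self.corpus_dir + name)

for corpus in StreamedCorpora():
    print corpus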
Known issue:
Step 2 prints the warning:
/usr/lib/python2.7/dist-packages/scipy/sparse/compressed.py:122: UserWarning: indices array has non-integer dtype (float64)
Processing is not affected.
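If the warning is distracting, it can be silenced with Python's standard warnings filter before the gensim/scipy imports; a minimal sketch (the message text may vary slightly between scipy versions):

import warnings
# Ignore the harmless dtype warning emitted by scipy.sparse
warnings.filterwarnings('ignore', message='indices array has non-integer dtype')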