Python for text preprocessing (word segmentation, stop-word filtering, word frequency statistics, feature selection, text representation)

Source: Internet
Author: User
Tags: idf

System: Windows 7, 32-bit

Word segmentation software: PyNLPIR

Integrated development environment (IDE): PyCharm


Function: implement the full multi-step text preprocessing pipeline, including word segmentation, stop-word filtering, word frequency statistics, feature selection, and text representation, and export the results in the .arff format that Weka can read.
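
Before the full script, here is a minimal sketch of the PyNLPIR calls it is built around (a working NLPIR licence is assumed, and the sample sentence is made up):

# -*- coding: utf-8 -*-
import pynlpir

pynlpir.open()                                  # load the NLPIR engine
# segment() returns (word, part-of-speech) pairs by default
for word, pos in pynlpir.segment(u'我爱北京天安门'):
    print(u'%s/%s' % (word, pos))
pynlpir.close()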


Straight to the code:


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Function: pynlpir text preprocessing pipeline: word segmentation, stop-word
filtering, term frequency statistics, feature selection, text representation.
Time: August 25, 2016 10:52:43
"""
import pynlpir
import codecs
import math

pynlpir.open()

# ---- Word segmentation ----
typelist = [u"Finance", u"IT", u"Health", u"Sports", u"Travel",
            u"Education", u"Recruit", u"Culture", u"Military"]
typetxt = codecs.open('C:\\Users\\Administrator\\Desktop\\TxtType.txt', 'a', encoding='utf-8')
wordseg_result = codecs.open('C:\\Users\\Administrator\\Desktop\\wordseg_result.txt', 'a', encoding='utf-8')
allresult = []
for j in range(1, 10):                          # 9 category folders
    for i in range(10, 510):                    # 500 texts per category
        typetxt.write(typelist[j - 1] + "\n")
        s = ""
        singletext_result = []
        print(u'Segmenting folder %s, text %s ...' % (j, i))
        f = codecs.open('C:\\Users\\Administrator\\Desktop\\TextMining_experiment2\\'
                        'word segment\\trainTxt500\\%d\\%d.txt' % (j, i), 'r', 'GB18030')
        for line in f:
            s += line.strip().encode('utf-8')
        f.close()
        for item in pynlpir.segment(s):
            singletext_result.append(item[0])   # keep the word, drop the POS tag
        allresult.append(singletext_result)
typetxt.close()
print(u'Category labelling finished. The results have been exported to TxtType.txt on the desktop.')

# Write all segmentation results into one txt, one text per line
for singletext_result in allresult:
    for item in singletext_result:
        wordseg_result.write(item + '\t')
    wordseg_result.write('\n')
wordseg_result.close()
print(u'Segmentation finished. The results have been exported to wordseg_result.txt on the desktop.' + '\n')

# ---- Stop-word filtering ----
stopwords = []
delstopwords_alltxt = []
st = codecs.open('C:\\Users\\Administrator\\PycharmProjects\\new textmining\\file\\stopwords.txt',
                 'rb', encoding='utf-8')
delstopwords_result = codecs.open('C:\\Users\\Administrator\\Desktop\\delstopwords_result.txt',
                                  'a', encoding='utf-8')
for line in st:
    stopwords.append(line.strip())
st.close()
print(u'Filtering stop words ...')
for singletext_result in allresult:
    delstopwords_singletxt = []
    for word in singletext_result:
        word = word.strip()
        if word not in stopwords:
            if word >= u'\u4e00' and word <= u'\u9fa5':   # keep Chinese words only
                delstopwords_singletxt.append(word)
    delstopwords_alltxt.append(delstopwords_singletxt)
for delstopwords_singletxt in delstopwords_alltxt:
    for everyword in delstopwords_singletxt:
        delstopwords_result.write(everyword + '\t')
    delstopwords_result.write('\n')
delstopwords_result.close()
print(u'Stop-word filtering finished. The results have been exported to delstopwords_result.txt on the desktop.' + '\n')

# ---- Absolute term frequency (TF) per text ----
gettf_result = codecs.open('C:\\Users\\Administrator\\Desktop\\getTF_result.txt', 'a', encoding='utf-8')
print(u'Counting TF ...')
for delstopwords_singletxt in delstopwords_alltxt:
    gettf_singletxt_dic = {}
    for everyword in delstopwords_singletxt:
        everyword = everyword.strip()
        if everyword in gettf_singletxt_dic:
            gettf_singletxt_dic[everyword] += 1
        else:
            gettf_singletxt_dic[everyword] = 1
    # sort by count, descending, so the largest TF comes first on each line
    gettf_singletxt_dic = sorted(gettf_singletxt_dic.items(), key=lambda d: d[1], reverse=True)
    for a, b in gettf_singletxt_dic:
        if b > 0:
            gettf_result.write(a + '\t' + str(b) + '\t')
    gettf_result.write('\n')
gettf_result.close()
print(u'TF counting finished. The results have been exported to getTF_result.txt on the desktop.' + '\n')

# ---- Feature selection: document frequency (DF) over all categories ----
alltext = []
wordlist = []
delstopwords_result = codecs.open('C:\\Users\\Administrator\\Desktop\\delstopwords_result.txt',
                                  'rb', encoding='utf-8')
for line in delstopwords_result:
    alltext.append(line)
    words = line.strip('\n').split('\t')
    for word in words:
        if word:
            wordlist.append(word)
delstopwords_result.close()
print(u'Total number of tokens: %d' % len(wordlist))
print(u'Number of texts: %d' % len(alltext))
print(u'Vocabulary size: %d' % len(set(wordlist)))
print('\n' + u'Computing DF over all categories ...')
word_df = []
for word in set(wordlist):
    count = 0
    for words in alltext:
        if word in words:                       # substring test on the whole line
            count += 1
    word_df.append([word, str(count)])          # stored as [word, DF]
word_df.sort(key=lambda x: int(x[1]), reverse=True)   # sort by DF, descending
b = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result.txt', 'a', encoding='utf-8')
b.truncate()
for item in word_df:
    for word in item:
        b.write(word + '\t')
    b.write('\n')
b.close()
b = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result.txt', 'rb', encoding='utf-8')
c = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result1.txt', 'a', encoding='utf-8')
for line in b:
    line = line.split('\t')
    if len(line[0]) > 1:                        # keep words of at least two characters
        c.write(line[0] + '\t' + line[1])
        c.write('\n')
c.close()
b.close()
print(u'All-category DF finished. The results have been exported to DF_allclass_result1.txt on the desktop.' + '\n')

# ---- Feature selection: DF per category ----
print(u'Computing per-category DF ...')
word_df2 = []
d = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result1.txt', 'rb', encoding='utf-8')
for line in d:
    line = line.split()
    word = line[0]
    counts = []
    for n in range(9):                          # 9 categories, 500 texts each
        count = 0
        for words in alltext[n * 500:(n + 1) * 500]:
            if word in words:
                count += 1
        counts.append(str(count))
    word_df2.append([word] + counts)            # stored as [word, DF1, ..., DF9]
d.close()
e = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_singleclass_result.txt', 'a', encoding='utf-8')
for item in word_df2:
    for term in item:
        e.write(term + '\t')
    e.write('\n')
e.close()
print(u'Per-category DF finished. The results have been exported to DF_singleclass_result.txt on the desktop.' + '\n')

# ---- Information gain of the candidate features ----
print(u'Computing information gain ...')
IG = []
g = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result1.txt', 'rb', encoding='utf-8')
h = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_singleclass_result.txt', 'rb', encoding='utf-8')
for line in g:
    line = line.split()
    word = line[0]
    word2 = float(line[1])                      # overall DF of the word
    pc = float(500) / float(4500)               # P(c): 500 texts per class out of 4500
    entropy = -9 * pc * math.log(pc, 2)         # H(C) for nine equiprobable classes
    pt = word2 / float(len(alltext))            # P(t): share of texts containing the word
    pt_1 = 1 - pt
    # read the matching line of per-category DFs (both files share the same order)
    line2 = h.readline().strip().split()
    # share of the word's texts that fall in the first eight categories
    pct_evenplus = (float(line2[1]) + float(line2[2]) + float(line2[3]) +
                    float(line2[4]) + float(line2[5]) + float(line2[6]) +
                    float(line2[7]) + float(line2[8])) / word2
    pct_evenplus_ = 1 - pct_evenplus
    # conditional entropies, using the convention 0 * log(0) = 0
    e1 = -(pct_evenplus * math.log(pct_evenplus, 2)) if pct_evenplus > 0 else 0.0
    e2 = -(pct_evenplus_ * math.log(pct_evenplus_, 2)) if pct_evenplus_ > 0 else 0.0
    exetropy = pt * e1 + pt_1 * e2
    ig_value = entropy - exetropy
    IG.append([word, str(ig_value)])            # stored as [word, IG value]
IG.sort(key=lambda x: float(x[1]), reverse=True)   # sort by IG, descending
i = codecs.open('C:\\Users\\Administrator\\Desktop\\IG_value.txt', 'a', encoding='utf-8')
i.truncate()
for item in IG:
    for word in item:
        i.write(word + '\t')
    i.write('\n')
i.close()
h.close()
g.close()
print(u'Information gain finished. The results have been exported to IG_value.txt on the desktop.' + '\n')

# ---- Select the feature words ----
print(u'Selecting feature words ...')
ig_threshold = 0.0      # IG cutoff; the exact value used originally is illegible in the source
j = codecs.open('C:\\Users\\Administrator\\Desktop\\IG_value.txt', 'rb', encoding='utf-8')
k = codecs.open('C:\\Users\\Administrator\\Desktop\\FeatureWords.txt', 'a', encoding='utf-8')
for line in j:
    line = line.split()
    if float(line[1]) > ig_threshold:
        k.write(line[0])
        k.write('\n')
k.close()
j.close()
print(u'Feature word selection finished. The results have been exported to FeatureWords.txt on the desktop.' + '\n')

# ---- Pair each feature word with its overall DF ----
f1 = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result1.txt', 'rb', encoding='utf-8')
f2 = codecs.open('C:\\Users\\Administrator\\Desktop\\FeatureWords.txt', 'rb', encoding='utf-8')
featurewords_value = codecs.open('C:\\Users\\Administrator\\Desktop\\FeatureWords_value.txt',
                                 'a', encoding='utf-8')
dic = {}
for line in f1:
    line = line.strip('\n').split('\t')
    dic[line[0]] = line[1]                      # word -> overall DF
f1.close()
for word in f2:
    word = word.strip()
    if word in dic:
        featurewords_value.write(word + '\t' + dic[word] + '\n')
f2.close()
featurewords_value.close()

# ---- Document vectorization ----
print(u'Vectorizing the texts ...')
feture_word = []        # feature words, in file order
feture_word_dic = {}    # word -> DF
feture_word_dic2 = {}   # word -> IDF
featurewords_value = codecs.open('C:\\Users\\Administrator\\Desktop\\FeatureWords_value.txt',
                                 'rb', encoding='utf-8')
for line in featurewords_value:
    line = line.split()
    idf = math.log(4500 / float(line[1]))
    feture_word.append(line[0])
    feture_word_dic[line[0]] = line[1]
    feture_word_dic2[line[0]] = idf
featurewords_value.close()

gettf_result = codecs.open('C:\\Users\\Administrator\\Desktop\\getTF_result.txt', 'rb', encoding='utf-8')
all = []                # one [word1, tf1, word2, tf2, ...] list per text
for line in gettf_result:
    line = line.strip('\n').split('\t')
    single = []
    for words in line:
        single.append(words)
    all.append(single)
gettf_result.close()

alltext_vector = []
for single in all:
    vector = []
    tmax = single[1]                            # lines are sorted, so the first TF is the largest
    for word in feture_word:
        if word in single:
            inde = single.index(word)
            t = single[inde + 1]                # the TF follows its word on the line
        else:
            t = 0
        tf_idf = (float(t) / float(tmax)) * float(feture_word_dic2[word])
        vector.append(tf_idf)
    alltext_vector.append(vector)

# append the class label to each vector, 500 texts per class
labels = ['economy', 'it', 'health', 'pe', 'travel',
          'education', 'employment', 'culture', 'military']
for n in range(9):
    for vector in alltext_vector[n * 500:(n + 1) * 500]:
        vector.append(labels[n])

# ---- Export to Weka's .arff format ----
data = codecs.open('C:\\Users\\Administrator\\Desktop\\data.arff', 'a', encoding='utf-8')
data.truncate()
data.write(u'@relation ' + u'SougouCorpus' + '\n')
for everyword in feture_word:
    data.write(u'@attribute ' + everyword + u' numeric\n')
data.write(u'@attribute type {economy,it,health,pe,travel,education,'
           u'employment,culture,military}\n\n@data\n')
for vector in alltext_vector:
    for value in vector[:-1]:
        data.write(str(value) + ',')
    data.write(str(vector[-1]) + '\n')
data.close()
print(u'Vectorization finished. The results have been exported to data.arff on the desktop.' + '\n')

pynlpir.close()
print(u'Text preprocessing finished.' + '\n')
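
The vector weights follow the standard normalized TF-IDF scheme: weight(t, d) = (tf(t, d) / tf_max(d)) * ln(N / df(t)), with N = 4500 texts. As a made-up example, a feature word occurring 4 times in a text whose most frequent word occurs 8 times, and appearing in 45 of the 4500 texts, gets (4/8) * ln(4500/45) = 0.5 * ln(100) ≈ 2.30 (math.log with one argument is the natural logarithm). The exported file then looks roughly like the sketch below, where the attribute names and values are illustrative only:

@relation SougouCorpus
@attribute 北京 numeric
@attribute 比赛 numeric
@attribute type {economy,it,health,pe,travel,education,employment,culture,military}

@data
0.0,2.30,economy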

