Practical Series Articles:
1 jieba segmentation and natural language processing: HanLP processing notes
2 Batch preprocessing notes for a Chinese corpus in Python
3 Notes on natural language processing
4 Calling the HanLP natural language processing tool from Python
5 Using jieba segmentation in Python
Encapsulated code: the script below traverses the SogouC corpus by category, segments each file with jieba and with HanLP, removes stop words, and writes the cleaned files back under a mirrored directory structure.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import jieba
import os
import re
import time
from jpype import *

'''
title: Batch-processing a text corpus with jieba segmentation
    1. Traverse the text files
    2. Create a save structure mirroring the original texts
    3. Apply jieba segmentation and stop-word removal to the original texts
    4. Normalize the preprocessing results and save them under the original file-structure path
author: Bai Ningsu
myblog: http://www.cnblogs.com/baiboy/
time: 2017-04-28 10:03:09
'''

'''Create a directory path: create a subdirectory under the root directory'''
def mkdir(path):
    # Check whether the path already exists
    isExists = os.path.exists(path)
    if not isExists:
        os.makedirs(path)
        print(path + ' created successfully')
        return True
    else:
        pass
    print('Please wait, the text is being preprocessed ...')

'''
jieba segmentation tool for Chinese word processing:
    read_folder_path:  root path of the raw corpus to be processed
    write_folder_path: path for the corpus after segmentation and data cleansing
'''
def ChSegment(read_folder_path, write_folder_path):
    # Stop-word list
    stopwords = {}.fromkeys([line.strip() for line in
                             open('../Database/stopwords/CH_stopwords.txt', 'r', encoding='utf-8')])
    # Get all categories under the root directory to be processed
    folder_list = os.listdir(read_folder_path)
    # Loop over categories
    for folder in folder_list:
        # Path of one category
        new_folder_path = os.path.join(read_folder_path, folder)
        # Create a matching save path
        mkdir(write_folder_path + folder)
        # Save path of this category
        save_folder_path = os.path.join(write_folder_path, folder)
        # All files in this category; loop within the category
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > len(files):
                break
            # Read the raw corpus
            raw = open(os.path.join(new_folder_path, file), 'r', encoding='utf-8').read()
            # Optionally keep only Chinese characters by first stripping
            # Latin letters, digits and punctuation, e.g.:
            # raw = re.sub('[A-Za-z0-9]', '', raw)
            # jieba segmentation in precise mode
            wordslist = jieba.cut(raw, cut_all=False)
            # Stop-word handling
            cutwordlist = ''
            for word in wordslist:
                if word == '\n':
                    cutwordlist += '\n'   # keep the original line-break format
                elif word not in stopwords and len(word) > 1:
                    cutwordlist += word + '/'   # drop stop words, single characters and spaces
            # Save the cleansed data
            with open(os.path.join(save_folder_path, file), 'w', encoding='utf-8') as f:
                f.write(cutwordlist)
            j += 1

'''
HanLP segmentation tool for Chinese word processing:
    read_folder_path:  root path of the raw corpus to be processed
    write_folder_path: path for the corpus after segmentation and data cleansing
'''
def HanLPSeg(read_folder_path, write_folder_path):
    # Start the JVM; on Linux replace the semicolon ';' in the classpath with a colon ':'
    startJVM(getDefaultJVMPath(),
             r"-Djava.class.path=C:\hanlp\hanlp-1.3.2.jar;C:\hanlp",
             "-Xms1g", "-Xmx1g")
    # Stop-word list
    stopwords = {}.fromkeys([line.strip() for line in
                             open('../Database/stopwords/CH_stopwords.txt', 'r', encoding='utf-8')])
    # Get all categories under the root directory to be processed
    folder_list = os.listdir(read_folder_path)
    # Loop over categories
    for folder in folder_list:
        # Path of one category
        new_folder_path = os.path.join(read_folder_path, folder)
        # Create a matching save path
        mkdir(write_folder_path + folder)
        # Save path of this category
        save_folder_path = os.path.join(write_folder_path, folder)
        # All files in this category; loop within the category
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > len(files):
                break
            # Read the raw corpus
            raw = open(os.path.join(new_folder_path, file), 'r', encoding='utf-8').read()
            # HanLP segmentation
            HanLP = JClass('com.hankcs.hanlp.HanLP')
            wordslist = HanLP.segment(raw)
            # The Java list prints as '[word1/tag, word2/tag, ...]';
            # split it and drop the bracketed first and last entries
            wordslist1 = str(wordslist).split(',')
            flagresult = ''
            # Strip the part-of-speech tags
            for v in wordslist1[1:len(wordslist1) - 1]:
                if '/' in v:
                    slope = v.index('/')
                    letter = v[1:slope]
                    if len(letter) > 0 and '\n\u3000\u3000' in letter:
                        flagresult += '\n'   # keep the original paragraph breaks
                    else:
                        flagresult += letter + '/'
            # Save the cleansed data
            with open(os.path.join(save_folder_path, file), 'w', encoding='utf-8') as f:
                f.write(flagresult.replace('/', ' '))
            j += 1
    shutdownJVM()

if __name__ == '__main__':
    print('Start segmenting the text:\n')
    t1 = time.time()
    dealpath = '../Database/SogouC/FileTest/'
    savepath = '../Database/SogouCCut/FileTest/'
    # Root of the corpus categories to segment
    read_folder_path = '../Database/SogouC/FileNews/'
    write_folder_path = '../Database/SogouCCut/'
    # jieba Chinese segmentation
    ChSegment(read_folder_path, write_folder_path)   # 300 txt files: jieba segmentation took 3.31 seconds
    # HanLP Chinese segmentation
    HanLPSeg(read_folder_path, write_folder_path)    # 300 txt files: HanLP segmentation took 1.83 seconds
    t2 = time.time()
    print('Finished segmenting the Chinese text: ' + str(t2 - t1) + ' seconds.')
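Before running the full batch job, a quick single-sentence check of each half of the pipeline helps confirm the segmentation and stop-word filtering behave as expected. The first sketch below is minimal and self-contained: the sample sentence and the tiny inline stop-word set are illustrative assumptions, not the CH_stopwords.txt file used by the script.

# Minimal smoke test for the jieba segmentation + stop-word filtering step.
# The sentence and the inline stop-word set are made-up examples, not the
# post's stop-word file.
import jieba

stopwords = {'的', '是', '了'}                 # tiny illustrative stop-word set
raw = '自然语言处理是计算机科学领域的一个重要方向。'

words = jieba.cut(raw, cut_all=False)          # precise mode, as in ChSegment()
cleaned = '/'.join(w for w in words if w not in stopwords and len(w) > 1)
print(cleaned)   # output resembles: 自然语言/处理/计算机/科学/领域/一个/重要/方向

The HanLP half can be checked the same way once the JVM is up. A minimal sketch, assuming hanlp-1.3.2.jar and its data directory live under C:\hanlp as in the script (adjust the classpath to your layout, and use ':' instead of ';' on Linux):

# Minimal check of HanLP segmentation through JPype; assumes the same
# C:\hanlp layout as HanLPSeg() above.
from jpype import startJVM, shutdownJVM, getDefaultJVMPath, JClass

startJVM(getDefaultJVMPath(),
         r"-Djava.class.path=C:\hanlp\hanlp-1.3.2.jar;C:\hanlp",
         "-Xms1g", "-Xmx1g")
HanLP = JClass('com.hankcs.hanlp.HanLP')
terms = HanLP.segment('自然语言处理是计算机科学领域的一个重要方向。')
print(terms)     # each term prints as word/part-of-speech, e.g. 自然语言处理/nz
shutdownJVM()    # call only once all work is done: JPype cannot restart the JVM in the same process

The second sketch also shows why HanLPSeg() needs the tag-stripping loop: HanLP returns word/part-of-speech pairs, so the '/tag' suffix has to be cut off before saving the cleaned text.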
Running results: on 300 txt files, jieba segmentation took about 3.31 seconds and HanLP segmentation about 1.83 seconds (timings from the comments in the script above).