Practical series articles:
1. Notes on jieba word segmentation and the HanLP natural language processing toolkit
2. Batch preprocessing of a Chinese corpus in Python (notebooks)
3. Notes on natural language processing
4. Calling the HanLP natural language processing toolkit from Python
5. Using jieba word segmentation in Python
Corpus preprocessing encapsulation (single-file and batch processors):
# coding=utf-8
"""Chinese text corpus preprocessing with jieba word segmentation.

Pipeline: single-file processor and batch (folder-tree) processor.
1. Walk the input tree to find text files.
2. Mirror the original directory structure under the save path.
3. Segment each text with jieba (POS-tagged) and remove stop words.
4. Filter unwanted POS tags, standardize, and save preserving structure.

Author: Bai Ningsu  (http://www.cnblogs.com/baiboy/)
"""
import os
import re
import sys
import time

import jieba
import jieba.posseg as pseg

sys.path.append("./")
jieba.load_userdict("../database/userdict.txt")  # load custom user dictionary


def cuttxtword(dealpath, savepath, stopwordspath):
    """Segment one file with POS tagging, drop stop words, then save.

    dealpath: path of the input text file
    savepath: path for the preprocessing result
    stopwordspath: path of the stop-word list (one word per line, UTF-8)
    """
    # Build the stop-word lookup table; `with` guarantees the file is closed
    # (the original left this handle open).
    with open(stopwordspath, "r", encoding="utf-8") as f:
        stopwords = {}.fromkeys(line.rstrip() for line in f)
    with open(dealpath, "r", encoding="utf-8") as f:
        txtlist = f.read()  # text to be processed
    cutresult = _segment(txtlist, stopwords)
    getflag(cutresult, savepath)


def _segment(txtlist, stopwords):
    """POS-tag-segment *txtlist* and return "word/flag " pairs, stop words removed."""
    pieces = []
    for word, flag in pseg.cut(txtlist):
        if word not in stopwords:  # drop stop words
            pieces.append(word + "/" + flag + " ")
    # join once instead of quadratic += concatenation
    return "".join(pieces)


def cutfileword(read_folder_path, write_folder_path, stopwordspath):
    """Batch processor: segment every file under each category sub-folder.

    read_folder_path: root of the input corpus (one sub-folder per category)
    write_folder_path: root under which results are saved (structure mirrored)
    stopwordspath: path of the stop-word list
    """
    with open(stopwordspath, "r", encoding="utf-8") as f:
        stopwords = {}.fromkeys(line.rstrip() for line in f)
    # Loop over category sub-folders of the corpus root.
    for folder in os.listdir(read_folder_path):
        new_folder_path = os.path.join(read_folder_path, folder)  # input category dir
        save_folder_path = os.path.join(write_folder_path, folder)  # mirrored output dir
        if not os.path.exists(save_folder_path):
            os.makedirs(save_folder_path)
            print(save_folder_path + ' create successful')
        print('--please wait, in process...')
        # Loop over the files inside one category.
        for file in os.listdir(new_folder_path):
            dealpath = os.path.join(new_folder_path, file)  # one input file
            with open(dealpath, "r", encoding="utf-8") as f:
                txtlist = f.read()
            # NOTE(review): the original had a commented-out re.sub pass to strip
            # punctuation before segmentation; it is intentionally left disabled.
            cutresult = _segment(txtlist, stopwords)
            getflag(cutresult, os.path.join(save_folder_path, file))


def getflag(cutresult, savepath):
    """Filter unwanted POS tags out of *cutresult*, strip tags, then save.

    cutresult: str of "word/flag " pairs from the initial segmentation
    savepath: path the standardized result is written to
    """
    # POS tags to filter out (jieba emits lowercase flags; the scraped
    # original's "/E", "/D" could never match).
    cixing = ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]
    txtlist = []  # tokens that survive POS filtering
    for line in cutresult.split('\n'):
        tokens = re.split('[ ]', line)
        tokens.append("\n")  # keep the original paragraph format
        kept = tokens[:]
        for seg in tokens:
            for k in cixing:
                if k in seg:
                    kept.remove(seg)
                    break
        txtlist.extend(kept)
    # Strip the "/flag" suffix from each surviving token.
    flagresult = ""
    for v in txtlist:
        if '/' in v:
            slope = v.index('/')
            flagresult += v[0:slope] + " "
        else:
            flagresult += v
    standdata(flagresult, savepath)


def standdata(flagresult, savepath):
    """Standardize the filtered result (drop near-empty lines) and write it.

    flagresult: POS-filtered text
    savepath: output file path (UTF-8)
    """
    # `with` replaces the original open/close pair so the handle is closed
    # even if a write raises.
    with open(savepath, "w", encoding="utf-8") as f2:
        for line in flagresult.split('\n'):
            if len(line) >= 2:  # skip blank / whitespace-only lines
                line_clean = "/".join(line.split())
                f2.write(line_clean + " " + "\n")


if __name__ == '__main__':
    t1 = time.time()
    # Single-file test paths.
    dealpath = "../database/SogouC/FileTest/1.txt"
    savepath = "../database/SogouCCut/FileTest/1.txt"
    stopwordspath = '../database/stopwords/CH_stopWords.txt'
    stopwordspath1 = '../database/stopwords/HG_stopWords.txt'  # HIT stop-word list
    # Batch input root (sample alternative: '../database/SogouC/Sample/').
    rfolder_path = '../database/SogouC/FileNews/'
    # Root path under which segmented results are saved.
    wfolder_path = '../database/SogouCCut/'
    # cuttxtword(dealpath, savepath, stopwordspath)  # single-file preprocessor
    cutfileword(rfolder_path, wfolder_path, stopwordspath)  # batch preprocessor
    t2 = time.time()
    print("Chinese corpus processing completed, time-consuming: "
          + str(t2 - t1) + " seconds.")  # report elapsed time
Execution result:
Batch preprocessing of the Chinese corpus completed in Python.