標籤:close pre 批量 letter nlp es2017 sts 文本處理 base
手記實用系列文章:
1 結巴分詞和自然語言處理HanLP處理手記
2 Python中文語料批量預先處理手記
3 自然語言處理手記
4 Python中調用自然語言處理工具HanLP手記
5 Python中結巴分詞使用手記
語料預先處理封裝類:
#coding=utf-8
"""Corpus preprocessing with jieba: single-text and batch-folder processors.

Pipeline:
  1. Walk the input text(s).
  2. Mirror the source folder structure under the output root.
  3. Segment the text with jieba (with part-of-speech tags) and drop
     stop words.
  4. Filter unwanted parts of speech, normalise the layout and save the
     result under the same relative path as the source file.

author: 白寧超
myblog: http://www.cnblogs.com/baiboy/
"""
import os
import re
import sys
import time

import jieba
import jieba.posseg as pseg

sys.path.append("../")
jieba.load_userdict("../Database/userdict.txt")  # load the custom segmentation dictionary

# Part-of-speech markers whose tokens are discarded by getFlag().
# NOTE(review): matching is by substring on the "word/flag" token, so "/e"
# also drops "/eng" tokens and "/d" also drops "/df", "/dg" - confirm that
# this is intended before tightening it to an exact match.
_FILTERED_POS_MARKERS = ("/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y")


def _load_stopwords(stopwordspath):
    """Return the stop words in *stopwordspath* (one per line) as a set."""
    with open(stopwordspath, "r", encoding='utf-8') as handle:
        return {line.rstrip() for line in handle}


def _segment_without_stopwords(text, stopwords):
    """Return *text* as a string of "word/flag " tokens, stop words removed."""
    return "".join(
        word + "/" + flag + " "
        for word, flag in pseg.cut(text)  # segmentation with POS tagging
        if word not in stopwords
    )


def cutTxtWord(dealpath, savepath, stopwordspath):
    """Segment, POS-tag and stop-word-filter a single text file.

    dealpath:      path of the text file to preprocess
    savepath:      path the preprocessed result is written to
    stopwordspath: path of the stop-word list (one word per line)
    """
    stopwords = _load_stopwords(stopwordspath)
    with open(dealpath, "r", encoding='utf-8') as f:
        text = f.read()
    getFlag(_segment_without_stopwords(text, stopwords), savepath)


def cutFileWord(read_folder_path, write_folder_path, stopwordspath):
    """Segment, POS-tag and stop-word-filter every file of every category.

    read_folder_path:  root folder holding one sub-folder per category
    write_folder_path: root folder for the results; each category folder
                       is recreated below it
    stopwordspath:     path of the stop-word list (one word per line)
    """
    stopwords = _load_stopwords(stopwordspath)

    # Loop over the categories found under the input root.
    for folder in os.listdir(read_folder_path):
        new_folder_path = os.path.join(read_folder_path, folder)
        if not os.path.isdir(new_folder_path):
            continue  # stray file in the root, not a category folder

        # Create exactly the directory that is written to below, so a root
        # path without a trailing separator still works.
        save_folder_path = os.path.join(write_folder_path, folder)
        if not os.path.exists(save_folder_path):
            os.makedirs(save_folder_path)
            print(save_folder_path + ' 建立成功')
        print('--> 請稍等,正在處理中...')

        # Loop over the files of this category.
        for file in os.listdir(new_folder_path):
            dealpath = os.path.join(new_folder_path, file)
            if not os.path.isfile(dealpath):
                continue  # nested folders are not processed
            with open(dealpath, "r", encoding='utf-8') as f:
                text = f.read()
            savepath = os.path.join(save_folder_path, file)
            getFlag(_segment_without_stopwords(text, stopwords), savepath)


def getFlag(cutresult, savepath):
    """Drop tokens with unwanted POS tags, strip the tags and save.

    cutresult: str of space-separated "word/flag" tokens (first-pass
               segmentation result)
    savepath:  path the result is written to
    """
    kept = []
    for line in cutresult.split('\n'):
        tokens = line.split(' ')
        tokens.append("\n")  # keep the original paragraph breaks
        kept.extend(
            token for token in tokens
            if not any(marker in token for marker in _FILTERED_POS_MARKERS)
        )

    # Strip the POS tag: keep only what precedes the first "/".
    pieces = [
        token.partition("/")[0] + " " if "/" in token else token
        for token in kept
    ]
    standdata("".join(pieces), savepath)


def standdata(flagresult, savepath):
    """Normalise the filtered text and write it to *savepath*.

    Lines shorter than two characters are skipped; the words of every
    remaining line are re-joined with "/ " and written one line at a time.

    flagresult: the POS-filtered text
    savepath:   path the result is written to
    """
    parent = os.path.dirname(savepath)
    if parent:
        # The single-file entry point does not create the output folder.
        os.makedirs(parent, exist_ok=True)
    with open(savepath, "w", encoding='utf-8') as out:
        for line in flagresult.split('\n'):
            if len(line) >= 2:
                out.write("/ ".join(line.split()) + " " + "\n")


if __name__ == '__main__':
    t1 = time.time()

    # Paths for testing a single file.
    dealpath = "../Database/SogouC/FileTest/1.txt"
    savepath = "../Database/SogouCCut/FileTest/1.txt"

    stopwordspath = '../Database/stopwords/CH_stopWords.txt'
    stopwordspath1 = '../Database/stopwords/HG_stopWords.txt'  # HIT stop-word list

    # Batch processing: input root (alternative: '../Database/SogouC/Sample/').
    rfolder_path = '../Database/SogouC/FileNews/'
    # Root folder the segmented results are saved under.
    wfolder_path = '../Database/SogouCCut/'

    # Chinese corpus preprocessors.
    # cutTxtWord(dealpath, savepath, stopwordspath)  # single-text preprocessor
    cutFileWord(rfolder_path, wfolder_path, stopwordspath)  # multi-text preprocessor

    t2 = time.time()
    print("中文語料語處理完成,耗時:"+str(t2-t1)+"秒。")  # report the elapsed time

# The source article continues with an "Execution result" section after
# this listing.
Python中文語料批量預先處理手記