# -*- coding: utf-8 -*-
import os
import jieba
def splitSentence(inputFile):
    """Segment one text file with jieba and append the result to the global output file.

    Reads ``inputFile`` line by line, cuts each line into words with
    ``jieba.cut``, drops single-character tokens and stop words (from the
    module-level ``stop`` list), and writes one space-joined line of the
    surviving words to the module-level ``fout`` handle.

    :param inputFile: path of the UTF-8 text file to segment.
    """
    global fout  # output file handle, opened by the driver script below
    global stop  # stop-word list, loaded by the driver script below
    # 'with' guarantees the input handle is closed even if jieba raises.
    # errors='ignore' mirrors the original decode(..., 'ignore') behavior.
    with open(inputFile, 'r', encoding='utf-8', errors='ignore') as fin:
        for eachLine in fin:
            line = eachLine.strip()  # drop surrounding whitespace/newline
            # Keep only multi-character words that are not stop words.
            # len(word) > 1 intentionally discards single characters.
            words = [word for word in jieba.cut(line)
                     if len(word) > 1 and word not in stop]
            # join() avoids the quadratic repeated string concatenation.
            fout.write(' '.join(words))
            fout.write('\n')
# Driver: walk the corpus directory, load stop words, then segment every file.
path = '/home/xdj/train'  # corpus root; each file under it is one document

# Collect the full path of every file in the tree rooted at `path`.
fns = [os.path.join(root, fn)
       for root, dirs, files in os.walk(path)
       for fn in files]

# Load the stop-word list once; splitSentence() reads it as a global.
# errors='ignore' skips undecodable bytes, matching the original behavior.
with open('/home/xdj/chstop.txt', encoding='utf-8', errors='ignore') as stop_file:
    stop = [line.strip() for line in stop_file]

# Output handle shared by all splitSentence() calls (global).
fout = open('myOutput.txt', 'w', encoding='utf-8')
try:
    # First line of the output is the number of documents.
    fout.write('%d\n' % len(fns))
    for f in fns:
        splitSentence(f)
    print(len(fns))
finally:
    fout.close()  # always release the handle, even if a file fails to parse
# Python corpus processing: read the files from a folder tree, segment words
# with jieba, remove stop words, and drop single-character tokens.