Environment: Hadoop 2.6.0, Spark 1.6.0, Python 2.7; download the code and data first.
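In addition, the NLTK calls used below (sentence/word tokenizer, stop-word list, POS tagger) rely on data packages that are not bundled with the library itself. A one-time download, assuming NLTK is installed on every node that runs the tasks (and add 'wordnet' if the commented-out lemmatizer is enabled):

    import nltk
    nltk.download('punkt')                        # tokenizer models for sent_tokenize / word_tokenize
    nltk.download('stopwords')                    # stop-word lists used by nltk.corpus.stopwords
    nltk.download('averaged_perceptron_tagger')   # POS tagger behind nltk.pos_tag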
The code is as follows:
    from pyspark import SparkContext
    import nltk
    from nltk.corpus import stopwords
    from functools import reduce

    sc = SparkContext('local', 'pyspark')
    data = sc.textFile("hdfs:/user/hadoop/test.txt")

    def filter_content(content):
        content_old = content
        content = content.split("%#%")[-1]
        # Sentence segmentation: sent_tokenize takes a text and returns a list of sentences
        sentences = nltk.sent_tokenize(content)
        # Tokenization, lower-cased
        words = [word.lower() for sentence in sentences for word in nltk.word_tokenize(sentence)]
        # Remove stop words
        words = [word for word in words if word not in stopwords.words('english')]
        # Remove punctuation and empty strings
        words = [word for word in words if word not in ['/', '^', '-', '+', '<', '>', '{', '}', '*', '//', ',', '.', '(', ')', '[', ']', '&', '!', '@', '|', '#', '$', '%', '"', "'", ':', ';', '?', '']]
        # POS tagging; keep only nouns and verbs
        words = [var[0] for var in nltk.pos_tag(words) if var[1][0] in ['N', 'V']]
        # Porter stemming
        words1 = [nltk.PorterStemmer().stem(word) for word in words]
        # words2 = [nltk.LancasterStemmer().stem(word) for word in words]         # Lancaster stemming
        # words3 = [nltk.WordNetLemmatizer().lemmatize(word) for word in words]   # WordNet lemmatization
        # words = set(words1 + words2 + words3)                                   # merge the three and deduplicate
        words = words1
        if words:
            # Join the words with spaces into a text and return it, keeping the original prefix fields
            return reduce(lambda a, b: str(a) + "%#%" + str(b), content_old.split("%#%")[:-1]) + "%#%" + reduce(lambda a, b: "%s %s" % (a, b), words) + '\n'
        elif content_old.split("%#%")[1]:
            return reduce(lambda a, b: str(a) + "%#%" + str(b), content_old.split("%#%")[:-1]) + "%#%" + '\n'
        else:
            return ''

    # filter_content("%#%I am a good boy.")

    data = data.map(lambda line: filter_content(line))
    data.saveAsTextFile("hdfs:/user/hadoop/test_result")
    data_list = data.collect()
    with open("/home/snow/zzwork/test_result.txt", "w") as fw:
        for var in data_list:
            fw.write(str(var))
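Before submitting the job, the mapper can be sanity-checked on a single record without Spark. A minimal sketch; the prefix field "doc-001" is made up for illustration:

    # Quick local check of filter_content(), independent of Spark.
    sample = "doc-001%#%I am a good boy."
    print(filter_content(sample))
    # Expected shape: the prefix fields joined by "%#%", then the stemmed
    # nouns/verbs of the last field, e.g. roughly "doc-001%#%boy".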