# Data cleansing in Python: filter each article down to its more difficult words
# coding=utf-8
"""Reduce each article of a given day to its less common words.

For every file under ``./mds/<date>/papers/`` the script counts the words,
drops every word whose surface form or guessed base form appears in the
"simple words" list, and writes the remainder to ``./mds/<date>/words/``.
The union of all kept words is written to ``./mds/<date>/allwords.txt``.

NOTE(review): this file was reconstructed from a copy whose identifier and
string casing had been mangled.  The casing of ``Simplewords.json``,
``getorigianlform`` and ``WC`` is kept as it appeared -- confirm it against
the real project before relying on it.
"""
from collections import Counter  # for frequency statistics
import json
import os
import re
import sys
import time
from sys import argv

# Words considered "simple"; anything found here is filtered out.
# A set gives O(1) membership tests; the word list is probed several times
# per candidate word.
exclude = []
with open('Simplewords.json', 'r', encoding='utf-8') as fo:
    exclude = set(json.load(fo))
# NOTE(review): the purpose of this pause is not evident from the code; it is
# kept only because the original script had it.
time.sleep(1)

# Punctuation removed from a line before it is split into words.  The closing
# curly quote is included next to the opening one so quoted words survive.
_PUNCTUATION = re.compile("[-'|,.)(\u201c\u201d\"]")

_SEPARATOR = '-' * 62


def getorigianlform(word):
    """Return True if *word* should be kept, i.e. it is not a simple word.

    The word is rejected (``False``) when it is not purely alphabetic, when
    it is in ``exclude``, or when one of the base forms guessed by stripping
    common English suffixes is in ``exclude``.  The suffix rules are applied
    in sequence and each one sees the result of the previous ones.
    """
    if not word.isalpha() or word in exclude:
        return False

    if word.endswith(('ment', 'ness')):
        word = word[:-4]
        if word in exclude:
            return False

    if word.endswith('tion'):
        # creation -> crea -> creat -> create
        word = word[:-4]
        if word in exclude:
            return False
        word += 't'
        if word in exclude:
            return False
        word = word[:-1] + 'e'
        if word in exclude:
            return False

    if word.endswith('ing'):
        # making -> mak -> make
        word = word[:-3]
        if word in exclude:
            return False
        word += 'e'
        if word in exclude:
            return False

    if word.endswith('ies'):
        # studies -> stud -> study
        word = word[:-3]
        if word in exclude:
            return False
        word += 'y'
        if word in exclude:
            return False

    if word.endswith('es'):
        # boxes -> box -> boxe, then restored to "boxes" for the rules below
        word = word[:-2]
        if word in exclude:
            return False
        word += 'e'
        if word in exclude:
            return False
        word += 's'

    if word.endswith('ers'):
        # teachers -> teacher -> teache -> teach
        for _ in range(3):
            word = word[:-1]
            if word in exclude:
                return False
        return True

    if word.endswith('est'):
        # biggest -> bigg -> big
        word = word[:-3]
        if word in exclude:
            return False
        word = word[:-1]
        if word in exclude:
            return False

    if word.endswith('ied'):
        # studied -> stud -> study.  Three characters are stripped (the
        # mangled original stripped two, producing "studiy"), matching the
        # 'ied' handling in WC.
        word = word[:-3]
        if word in exclude:
            return False
        word += 'y'
        if word in exclude:
            return False
        return True

    if word.endswith(('ted', 'ded')):
        # wanted -> want -> wan (covers doubled consonants)
        word = word[:-2]
        if word in exclude:
            return False
        word = word[:-1]
        if word in exclude:
            return False
        return True

    if word.endswith('ed'):
        # used -> us -> use
        word = word[:-2]
        if word in exclude:
            return False
        word += 'e'
        if word in exclude:
            return False
        return True

    if word.endswith('s') and len(word) > 3:
        word = word[:-1]
        if word in exclude:
            return False

    if word.endswith('ly'):
        if word.endswith('ily'):
            # happily -> happy
            word = word[:-3] + 'y'
            if word in exclude:
                return False
        word = word[:-2]
        if word in exclude:
            return False

    return True


# One comma-joined string of kept words per processed article.
todayall = []


def WC(filename, outpath, top_n=100):
    """Count the non-simple words of *filename* and write them to *outpath*.

    Args:
        filename: path of the article to read.
        outpath: path of the text file to write.
        top_n: how many of the most frequent words go on the first output
            line.  NOTE(review): the argument was lost in the mangled
            original; 100 is inferred from the ``mbai``/``baistr`` names.

    Side effects:
        Prints the results, writes *outpath*, and appends the
        comma-joined set of kept words to the module-level ``todayall``.
    """
    kept = []
    with open(filename, 'r', encoding='utf-8') as fwc:
        for line in fwc:
            content = _PUNCTUATION.sub('', line.lower())
            # split() rather than split(' '): the latter leaves the newline
            # on the last token, which then fails isalpha() and is dropped.
            kept.extend(
                token for token in content.split()
                if len(token) > 2 and getorigianlform(token)
            )

    wordlst = Counter(kept)
    mbai = [word for word, _count in wordlst.most_common(top_n)]
    print(_SEPARATOR + 'most')
    print(','.join(mbai))

    allwords = [
        word for word, _count in wordlst.most_common()
        if len(word) > 2 and word not in exclude
    ]
    result = []
    for word in allwords:
        # Rough normalisation toward a base form; rules apply in sequence.
        if word.endswith('ing'):
            word = word[:-3]
        if word.endswith('ings'):
            word = word[:-4]
        if word.endswith('ers'):
            word = word[:-1]
        if word.endswith(('ies', 'ied')):
            word = word[:-3] + 'y'
        if word.endswith('ded'):
            word = word[:-2]
        if word.endswith(('es', 'ts')):
            word = word[:-1]
        if word in exclude:
            continue
        if word.isalpha():
            result.append(word)

    datas = ','.join(mbai) + '\n' + ','.join(result)
    todayall.append(','.join(set(result)))
    print(_SEPARATOR + 'all words string')
    print(datas)
    with open(outpath, 'w', encoding='utf-8') as fo:
        fo.write(datas)


def main():
    """Process every article of the requested (or current) date."""
    try:
        _script_name, datestr = argv
    except ValueError as err:
        # No date (or too many arguments) on the command line: use today.
        print(err)
        # NOTE(review): the mangled original read '%y-%m-%d'; a four-digit
        # year is assumed here -- confirm against existing ./mds/ folders.
        datestr = time.strftime('%Y-%m-%d')
    # Unpacking validates the "<year>-<month>-<day>" shape; the values
    # themselves are not used further.
    toyear, tomonth, today = map(int, datestr.split('-'))

    readpath = './mds/' + datestr + '/papers/'
    outpath = './mds/' + datestr + '/words/'

    testgo = False  # flip to run a single local file instead
    if testgo:
        WC('todaymd.md', 'todaymd.txt')
        sys.exit()

    if not os.path.exists(readpath):
        print('Dir not Found!')
        os.makedirs(readpath)
        sys.exit()

    readmds = [
        (os.path.join(readpath, item), item)
        for item in os.listdir(readpath)
        if os.path.isfile(os.path.join(readpath, item))
    ]
    if not readmds:
        print('NO PAPER to filter!')
        sys.exit()

    os.makedirs(outpath, exist_ok=True)

    for countpaper, name in readmds:
        # "article.md" -> "article.txt"
        outwords = os.path.join(outpath, os.path.splitext(name)[0] + '.txt')
        print(countpaper, outwords)
        WC(countpaper, outwords)

    # Merge the per-article word strings into one de-duplicated list;
    # sorted so the output file is stable between runs.
    unique_words = sorted(
        {word for entry in todayall for word in entry.split(',') if word}
    )
    todayallwords = './mds/' + datestr + '/allwords.txt'
    with open(todayallwords, 'w', encoding='utf-8') as foo:
        foo.write(','.join(unique_words))


if __name__ == "__main__":
    main()