Note: the serial version is already fast — it finishes in about 0.2 s on my Core Duo machine running Debian 7.6 — and is hard to beat. The multi-process version cannot avoid substantial process-creation, data-synchronization, and data-transfer overhead, so its performance is worse than the serial version's; treat it as a learning example only. It may be optimized later.
#-------------------------------------------------------------------------------
# Name:      wordstat_multiprocessing.py
# Purpose:   count the occurrences of each word in the Java files of a given
#            directory, using multiple processes
#
# Author:    qin.shuq
# Created:   09/10/2014
# Copyright: (c) Qin.shuq
# Licence:   <your licence>
#-------------------------------------------------------------------------------
import re
import os
import time
import logging
from queue import Empty
from multiprocessing import Process, Manager, Pool, Pipe, cpu_count

# Symbolic level name -> logging module level constant.
# (The original mapped 'INFO' to the logging.info *function* and 'CRITICAL'
# to a nonexistent logging.Critical attribute; both are fixed here.)
log_levels = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARN': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL,
}

ncpu = cpu_count()


def initlog(filename):
    """Create and return a logger that appends INFO-and-above records to *filename*.

    Each log file gets its own named logger; using the root logger here (as the
    original did) would attach every handler to the same logger, duplicating
    all messages into every file.
    """
    logger = logging.getLogger(filename)
    hdlr = logging.FileHandler(filename)
    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(log_levels['INFO'])
    return logger


# Module-level loggers shared by the helpers below.
errlog = initlog("Error.log")
infolog = initlog("Info.log")


class FileObtainer(object):
    """Collect the paths of all files under a directory tree.

    filefilterfunc, when given, is a predicate applied to each full path;
    only paths for which it returns true are kept.
    """

    def __init__(self, dirpath, filefilterfunc=None):
        self.dirpath = dirpath
        self.filefilterfunc = filefilterfunc

    def findallfilesindir(self):
        """Walk self.dirpath and return a list of (optionally filtered) file paths."""
        files = []
        for path, dirs, filenames in os.walk(self.dirpath):
            for filename in filenames:
                files.append(os.path.join(path, filename))
        if self.filefilterfunc is None:
            return files
        # list(...) because filter() is lazy in Python 3.
        return list(filter(self.filefilterfunc, files))


class MultiQueue(object):
    """A bundle of Manager queues: puts rotate round-robin, gets poll each queue.

    get() returns None when every queue stays empty for *timeout* seconds,
    which the consumer treats as end-of-stream.
    """

    def __init__(self, qnum, timeout):
        manager = Manager()
        self.timeout = timeout
        self.qnum = qnum
        self.queues = []
        self.pindex = 0  # next queue to receive a put (round-robin cursor)
        for _ in range(self.qnum):
            self.queues.append(manager.Queue())

    def put(self, obj):
        """Put *obj* on the next queue in round-robin order."""
        self.queues[self.pindex].put(obj)
        self.pindex = (self.pindex + 1) % self.qnum

    def get(self):
        """Try each queue in turn; return the first item found, or None on timeout."""
        for i in range(self.qnum):
            try:
                return self.queues[i].get(True, self.timeout)
            except Empty as emp:
                print('Not Get.')
                errlog.error('In wordreading:' + str(emp))
        return None


def readFile(filename):
    """Return the lines of *filename*; on I/O error, log it and return []."""
    try:
        # 'with' guarantees the handle is closed even if readlines() raises.
        with open(filename, 'r') as f:
            lines = f.readlines()
        infolog.info('[Successful read file %s]\n' % filename)
        return lines
    except IOError:
        errorinfo = 'file %s not found \n' % filename
        errlog.error(errorinfo)
        return []


def batchreadfiles(filelist, iopool, mq):
    """Read every file in *filelist* via *iopool* and put the combined lines on *mq*."""
    futureresult = [iopool.apply_async(readFile, args=(filename,))
                    for filename in filelist]
    alllines = []
    for res in futureresult:
        alllines.extend(res.get())
    mq.put(alllines)


class WordReading(object):
    """Producer: reads all given files with an I/O process pool and feeds the queue."""

    def __init__(self, allfiles, mq):
        self.allfiles = allfiles
        self.mq = mq
        # I/O-bound work: oversubscribe the pool relative to the CPU count.
        self.iopool = Pool(ncpu * 3)
        infolog.info('wordreading Initialized')

    def run(self):
        # NOTE: the original referenced the global 'allfiles' here; use the
        # instance attribute so the class works outside the __main__ script.
        batchreadfiles(self.allfiles, self.iopool, self.mq)


def processlines(lines):
    """Count word occurrences in *lines*; return a {word: count} dict."""
    result = {}
    linescontent = ''.join(lines)
    matches = WordAnalyzing.wordRegex.findall(linescontent)
    for word in matches:
        result[word] = result.get(word, 0) + 1
    return result


def mergetosrcmap(srcmap, destmap):
    """Add destmap's counts into srcmap in place; return srcmap."""
    for key, value in destmap.items():
        srcmap[key] = srcmap.get(key, 0) + value
    return srcmap


class WordAnalyzing(object):
    '''return Map<word, count> the occurrence times of each Word'''

    wordRegex = re.compile(r"[\w]+")

    def __init__(self, mq, conn):
        self.mq = mq
        self.cpupool = Pool(ncpu)  # CPU-bound counting: one worker per core
        self.conn = conn           # pipe end used to signal completion
        self.resultmap = {}
        infolog.info('wordanalyzing Initialized')

    def run(self):
        """Drain the queue, count words in worker processes, merge the results."""
        starttime = time.time()
        futureresult = []
        while True:
            lines = self.mq.get()
            if lines is None:  # queue timed out: treat as end-of-stream
                break
            futureresult.append(
                self.cpupool.apply_async(processlines, args=(lines,)))
        for res in futureresult:
            mergetosrcmap(self.resultmap, res.get())
        endtime = time.time()
        print('wordanalyzing analyze Cost:', (endtime - starttime) * 1000, 'Ms')
        self.conn.send('OK')
        self.conn.close()

    def obtainresult(self):
        return self.resultmap


class PostProcessing(object):
    """Sort the word-count map and print the most frequent entries."""

    def __init__(self, resultmap):
        self.resultmap = resultmap

    def sortbyvalue(self):
        """Return (word, count) pairs sorted by count, descending."""
        return sorted(self.resultmap.items(), key=lambda e: e[1], reverse=True)

    def obtaintopn(self, topn):
        """Print the *topn* most frequent words (fewer if the map is smaller)."""
        sortedresult = self.sortbyvalue()
        sortednum = len(sortedresult)
        topn = sortednum if topn > sortednum else topn
        for i in range(topn):
            topi = sortedresult[i]
            print(topi[0], 'counts:', topi[1])


if __name__ == "__main__":
    dirpath = "/home/lovesqcc/workspace/java/javastudy/src/"
    if not os.path.exists(dirpath):
        print('dir %s not found.' % dirpath)
        exit(1)

    fileobtainer = FileObtainer(dirpath, lambda f: f.endswith('.java'))
    allfiles = fileobtainer.findallfilesindir()

    mqtimeout = 0.01
    mqnum = 1
    mq = MultiQueue(mqnum, timeout=mqtimeout)

    p_conn, c_conn = Pipe()
    wr = WordReading(allfiles, mq)
    wa = WordAnalyzing(mq, c_conn)
    wr.run()
    wa.run()

    msg = p_conn.recv()
    if msg == 'OK':
        pass  # taking less time, parallel not needed.

    postproc = PostProcessing(wa.obtainresult())
    postproc.obtaintopn(30)
    print('exit the program.')
This program implements, in Python, a multi-process word count over the Java files in a specified directory.