# Server ---------------------------------------------------------------------
# Master node of a small distributed crawler: accepts worker connections,
# hands out queued URLs to workers, and stores the pages they send back.
import socket
import select
import re
import queue

import redis
from multiprocessing import Pool, cpu_count
from pymongo import MongoClient

host = '192.168.1.107'          # interface the master listens on
connectionlist = []             # sockets watched by select(): server + clients
recv_buffer = 4096000           # max bytes per recv() call
client_status = {}              # client socket -> free core count it reported
client_num = {}                 # client socket -> sequential client id
redis1 = redis.Redis(host='localhost', port=6379, db=0)  # seen-URL store
num = 0


class Distributed_web_crawler:
    """Crawl master: seed the URL queue, serve workers, persist results."""

    def __init__(self, port):
        """Bind the listening socket, seed the start URL, enter the loop.

        NOTE(review): __init__ never returns — it calls self.main(), which
        loops forever.
        """
        self.url_num = 1                # next id assigned to a newly seen URL
        self.queue = queue.Queue()      # URLs waiting to be dispatched
        self.db = MongoClient().crawspider.content
        self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.server_socket.bind((host, port))
        self.server_socket.listen(10)
        self.pool = Pool(cpu_count() - 1)
        connectionlist.append(self.server_socket)
        print("server running on port:" + str(port))
        address = 'https://movie.douban.com/'   # seed URL
        self.queue.put(address)
        redis1.set(address, 0)                  # mark the seed as seen
        self.main()

    def main(self):
        """Event loop: dispatch URL batches, accept clients, process data."""
        global num
        while 1:
            # Hand out queued URLs whenever at least one worker is connected.
            # BUG FIX: the original wrote
            #     self.pool.apply_async(self.task_manage())
            # which runs task_manage() synchronously and then schedules its
            # None return value on the pool; call it directly instead (a
            # bound method of an object holding live sockets could not be
            # pickled for a worker process anyway).  The original condition
            # also ended in a spurious "... > 1 is not None" chain.
            if not self.queue.empty() and len(connectionlist) > 1:
                self.task_manage()
            read_sockets, write_sockets, error_sockets = select.select(
                connectionlist, [], [])
            for sock in read_sockets:
                if sock == self.server_socket:
                    # New worker: register it and read its core count.
                    conn, addr = self.server_socket.accept()
                    connectionlist.append(conn)
                    core_num = conn.recv(recv_buffer).decode('UTF8')
                    client_status[conn] = core_num
                    client_num[conn] = len(client_num) + 1
                    print('Client' + addr[0] + ':' + str(addr[1])
                          + 'connected, number of cores:' + core_num
                          + '\n is numbered' + str(client_num[conn]))
                else:
                    data = sock.recv(recv_buffer)
                    if data:
                        # Workers concatenate pages with a sentinel marker;
                        # split the stream back into individual pages.
                        contents = data.decode('UTF8').split(
                            'page_contentpppppp///////')
                        # Each delivered page frees one core on that worker.
                        client_status[sock] = (int(client_status[sock])
                                               + len(contents))
                        print('numbering' + str(client_num[sock])
                              + 'Available Cores' + str(client_status[sock]))
                        for content in contents:
                            if content:
                                # BUG FIX (same pattern as above): run the
                                # parser directly instead of scheduling its
                                # None return value on the pool.
                                self.web_page_resolution(content)
                    else:
                        # Empty recv() => the client closed the connection.
                        # NOTE(review): `addr` here is the address of the
                        # most recently *accepted* client, not necessarily
                        # the disconnecting one (and is unbound if no client
                        # was ever accepted in this process) — confirm and
                        # track per-socket addresses if accurate logging is
                        # needed.
                        print('Client' + addr[0] + ':' + str(addr[1])
                              + 'Disconnect Connection')
                        sock.close()
                        client_status.pop(sock)
                        client_num.pop(sock)
                        connectionlist.remove(sock)

    def web_page_resolution(self, content):
        """Store one page in MongoDB and queue any new douban links in it."""
        db = MongoClient().web.data
        # NOTE(review): modern pymongo spells this insert_one(); also
        # __init__ opened crawspider.content as self.db but it is unused
        # here — confirm which collection is intended.
        db.insert({'page_content': content})
        # Links are taken up to the closing quote of an href attribute.
        pattern = re.compile(r'https://movie.douban.com/(.*?)"')
        urls = re.findall(pattern=pattern, string=content)
        for url in urls:
            url = 'https://movie.douban.com/' + url
            # Redis acts as the global "already seen" set.
            if redis1.get(url) is None:
                redis1.set(url, self.url_num)
                self.queue.put(url)
                self.url_num += 1

    def task_manage(self):
        """Send each worker a space-separated URL batch sized by its free cores.

        BUG FIX: the original built one string across the whole client loop,
        so every client after the first was re-sent the previous clients'
        URLs; the batch is now reset per client.  The loop variable also no
        longer shadows the `socket` module.
        """
        for client in connectionlist:
            if client == self.server_socket:
                continue
            batch = ''
            while not self.queue.empty() and int(client_status[client]) != 0:
                batch = batch + self.queue.get() + ' '
                client_status[client] = int(client_status[client]) - 1
            client.send(batch.encode('UTF8'))


if __name__ == "__main__":
    port = 8888
    Distributed_web_crawler(port)
# Client ---------------------------------------------------------------------
# Worker node: reports its core count to the master, receives batches of
# URLs, fetches each page and streams the content back.
import socket
import sys
import select
from multiprocessing import cpu_count
from requests import get
from multiprocessing import Pool

p = Pool(cpu_count() - 1)        # local fetch workers
host = '192.168.0.103'           # NOTE(review): unused — connect() below hard-codes the master IP
page_contents = []               # NOTE(review): unused


def crawler_page(url):
    """Fetch one page and send it to the master, sentinel-terminated.

    The trailing sentinel lets the server split concatenated pages back
    apart.  NOTE(review): relies on the module-level socket `s` being
    inherited by pool workers via fork — confirm on non-POSIX platforms.
    """
    print("Crawling Web pages" + url)
    content = get(url).content.decode('UTF8') + 'page_contentpppppp///////'
    print(url + "crawl complete, sending data to server")
    s.send(content.encode('UTF8'))


def listing():
    """React to server messages: crawl URL batches, or exit on 'quit'."""
    while 1:
        rlist = [sys.stdin, s]
        read_list, write_list, error_list = select.select(rlist, [], [])
        for sock in read_list:
            if sock == s:
                data = sock.recv(4096).decode('UTF8')
                if data != 'quit' and data:
                    urls = data.split()
                    # BUG FIX: the original called
                    #     p.apply_async(crawler_page(url))
                    # which fetches synchronously and schedules the None
                    # result on the pool; pass the callable and its args
                    # instead.  The original also removed items from `urls`
                    # while iterating over it, which skips every other URL.
                    for url in urls:
                        p.apply_async(crawler_page, (url,))
                elif data == 'quit':
                    print('the client is exiting after receiving the server shutdown instruction')
                    sys.exit()
                else:
                    # Empty recv() => the server went away.
                    print('Server connection failed, exiting')
                    sys.exit()


if __name__ == "__main__":
    port = 8888
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(3)
    try:
        s.connect(('192.168.1.107', port))
    # BUG FIX: narrowed the bare except to socket-level errors so genuine
    # programming errors (e.g. NameError) are not swallowed.
    except OSError:
        print("Unable to connect to the server, please check the address and try again")
        sys.exit()
    print("connected to server, start sending machine information \n Core number:" + str(cpu_count()))
    s.send(str(cpu_count()).encode('UTF8'))
    listing()
Python — Distributed Crawler (server and client demo)