Scheduler:
import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    """Scheduler: wires the URL manager, downloader, parser and outputer together."""

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                print('craw %d : %s' % (count, new_url))
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)      # queue the links found on this page
                self.outputer.collect_data(new_data)  # keep the extracted title/summary
                if count == 1000:                     # stop after 1000 entries
                    break
                count = count + 1
            except Exception:
                print('craw failed')
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
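The scheduler imports the other four components as sibling modules, so the code assumes a flat layout roughly like the one below (file names inferred from the import line; the exact arrangement is an assumption):

spider_main.py       # scheduler (SpiderMain, shown above)
url_manager.py       # UrlManager
html_downloader.py   # HtmlDownloader
html_parser.py       # HtmlParser
html_outputer.py     # HtmlOutputer

Running python spider_main.py starts from the Python entry page and keeps crawling until 1000 pages have been collected or the queue of new URLs runs dry.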
URL Manager:
class UrlManager(object):
    """Keeps track of URLs waiting to be crawled (new_urls) and URLs already crawled (old_urls)."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
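A minimal usage sketch of the manager on its own (not part of the crawler), showing how duplicates are ignored and how a popped URL is remembered in old_urls:

manager = UrlManager()
manager.add_new_url("https://baike.baidu.com/item/Python")
manager.add_new_url("https://baike.baidu.com/item/Python")  # duplicate, ignored
print(manager.has_new_url())   # True
url = manager.get_new_url()    # pops the URL and moves it to old_urls
manager.add_new_url(url)       # already crawled, ignored
print(manager.has_new_url())   # False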
Web Downloader:
import requests
import requests.packages.urllib3.util.ssl_

# Relax the cipher list so the HTTPS handshake with baike.baidu.com succeeds.
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'


class HtmlDownloader(object):
    """Fetches a page and returns its text, or None on failure."""

    def download(self, url):
        if url is None:
            return None
        response = requests.get(url)
        response.encoding = 'utf-8'
        if response.status_code != 200:
            return None
        return response.text
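The DEFAULT_CIPHERS = 'ALL' line appears to be a workaround for TLS handshake failures that some requests/urllib3 versions hit against baike.baidu.com. A quick standalone check of the downloader might look like this (a sketch; the printed length will vary):

downloader = HtmlDownloader()
html = downloader.download("https://baike.baidu.com/item/Python")
if html is None:
    print("download failed (non-200 status or missing URL)")
else:
    print("fetched %d characters" % len(html))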
Web Parser:
from bs4 import BeautifulSoup
import re
import urllib.parse


class HtmlParser(object):
    """Extracts new entry links and the title/summary from a Baidu Baike page."""

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry pages are linked as relative URLs of the form /item/xxx
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
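Since parse() only needs a URL and an HTML string, it can be exercised without hitting the network. The snippet below is a sketch using a hand-written page that mimics the markup the parser looks for (lemmaWgt-lemmaTitle-title and lemma-summary):

sample_html = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">Python is a programming language.</div>
  <a href="/item/Guido">Guido</a>
</body></html>
'''
parser = HtmlParser()
new_urls, new_data = parser.parse("https://baike.baidu.com/item/Python", sample_html)
print(new_urls)             # {'https://baike.baidu.com/item/Guido'}
print(new_data['title'])    # Python
print(new_data['summary'])  # Python is a programming language.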
Output:
class HtmlOutputer(object):
    """Collects the parsed records and writes them out as an HTML table."""

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
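A small sketch of the outputer in isolation; feeding in one record by hand produces an output.html with a one-row table in the current directory:

outputer = HtmlOutputer()
outputer.collect_data({
    'url': 'https://baike.baidu.com/item/Python',
    'title': 'Python',
    'summary': 'Python is a programming language.',
})
outputer.output_html()   # writes output.html next to the script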
Crawler example: crawl Baidu Baike starting from the Python entry and collect up to 1000 related entries.