1. Spider_main
# coding:utf8
# NOTE(review): module path reconstructed from garbled text ("Spider_test") —
# confirm the actual package name against the project layout.
from spider_test import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    """Crawl coordinator: wires the URL manager, downloader, parser and outputer together."""

    def __init__(self):
        # One instance of each collaborator; the spider owns them for its lifetime.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        """Breadth-first crawl starting at root_url, stopping after 7 pages,
        then dump everything collected to output.html.

        :param root_url: the first encyclopedia-entry URL to fetch.
        """
        count = 1
        self.urls.add_new_url(root_url)
        # BUG FIX: the original tested the bound method object (`has_new_url`,
        # always truthy => infinite loop risk); it must be *called*.
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 7:  # demo limit: stop after 7 pages
                    break
                count = count + 1
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; any page-level failure is reported and skipped.
            except Exception:
                print("craw fail")
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/114149.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
2. Url_manager
# coding:utf8


class UrlManager(object):
    """Crawl frontier: tracks URLs waiting to be crawled and URLs already crawled,
    guaranteeing each URL is handed out at most once."""

    def __init__(self):
        self.new_urls = set()  # discovered but not yet crawled
        self.old_urls = set()  # already handed out via get_new_url()

    def add_new_url(self, url):
        """Queue a single URL unless it is None or already known (new or old)."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue a batch of URLs; silently ignores None or an empty iterable."""
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while at least one URL is still waiting to be crawled."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop an arbitrary pending URL, mark it as crawled, and return it."""
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
3. Html_downloader
# coding:utf8
# Python 3.x: urlopen lives in urllib.request (it was in urllib2 under Python 2.7).
from urllib import request


class HtmlDownloader(object):
    """Fetches a page over HTTP and returns its body as text."""

    def download(self, url):
        """Download *url* and return the decoded page, or None on a bad
        input / non-200 response.

        :param url: absolute URL string, or None.
        :returns: page HTML as str (utf-8, undecodable bytes dropped), or None.
        """
        if url is None:
            return None
        response = request.urlopen(url)
        # The status check was mojibake in the scraped original ("!= £ º");
        # reconstructed as the conventional 200-OK test.
        if response.getcode() != 200:
            return None
        # 'ignore' drops bytes that are not valid utf-8 instead of raising.
        return response.read().decode('utf-8', 'ignore')
4. Html_parser
# coding:utf8
import re
import urllib.parse

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extracts follow-up entry links and the title/summary data from a
    Baidu Baike page."""

    def _get_new_urls(self, page_url, soup):
        """Collect absolute URLs of other encyclopedia entries linked from the page."""
        new_urls = set()
        # Entry links look like /view/<digits>.htm
        links = soup.find_all("a", href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # urljoin moved to urllib.parse in Python 3 (urlparse.urljoin in 2.7).
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Build {'url', 'title', 'summary'} for the page.

        Raises AttributeError if the expected nodes are missing, which the
        caller's except-wrapper treats as a failed page.
        """
        res_data = {}
        res_data['url'] = page_url
        # NOTE(review): selector reconstructed from garbled text — the page
        # wraps the title in <dl class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-">;
        # verify against the live markup if parsing returns nothing.
        title_node = soup.find('dl', class_="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-").find("h1")
        res_data['title'] = title_node.get_text()
        # The class name must match exactly or find() returns None and this raises.
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parse a downloaded page.

        :param page_url: URL the content was fetched from (base for urljoin).
        :param html_cont: raw HTML text, or None.
        :returns: (new_urls set, data dict), or None when either input is None.
        """
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
5. Html_outputer
# coding:utf8


class HtmlOutputer(object):
    """Accumulates per-page data dicts and renders them as an HTML table."""

    def __init__(self):
        self.datas = []  # list of {'url', 'title', 'summary'} dicts

    def collect_data(self, data):
        """Store one page's data dict; None is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write everything collected so far to output.html as a table.

        NOTE(review): the tail of this method was truncated in the scraped
        source; the table body below is reconstructed from the standard shape
        of this tutorial. Writing utf-8 explicitly (plus the charset meta tag)
        addresses the garbled-characters problem the original author hit —
        the Windows console workaround `cmd /k chcp 65001` is then unnecessary
        for the file itself.
        """
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write("<html>")
            fout.write("<head><meta charset='utf-8'></head>")
            fout.write("<body>")
            fout.write("<table>")
            for data in self.datas:
                fout.write("<tr>")
                fout.write("<td>%s</td>" % data['url'])
                fout.write("<td>%s</td>" % data['title'])
                fout.write("<td>%s</td>" % data['summary'])
                fout.write("</tr>")
            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")
Python 3.5 crawler: crawling Baidu Baike (Baidu Encyclopedia) entries