1. spider_main
# coding: utf8
# 1. spider_main -- crawler entry point wiring the four components together.
from baike_spider import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    """Coordinates the crawl: pops pending URLs, downloads, parses, collects output."""

    def __init__(self):
        # The four collaborating components of the crawler.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        """Crawl breadth-first starting from root_url, stopping after 100 pages,
        then write the collected data to output.html."""
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 100:  # hard cap so the demo crawl terminates
                    break
                count += 1
            except Exception as e:
                # Was a bare `except:` that hid every failure; at least say why.
                print("craw fail: %s" % e)
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
2. url_manager
# coding: utf8
# 2. url_manager -- tracks which URLs are pending and which were already crawled.


class UrlManager(object):
    """Holds two disjoint sets: URLs not yet crawled and URLs already crawled."""

    def __init__(self):
        self.new_urls = set()  # pending URLs
        self.old_urls = set()  # already-crawled URLs

    def add_new_url(self, url):
        """Queue a single URL unless it is None or already known (either set)."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue an iterable of URLs; None or empty input is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while there are pending URLs."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop an arbitrary pending URL, mark it crawled, and return it."""
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
3. html_downloader
# coding: utf8
# 3. html_downloader -- fetches raw page content over HTTP.
import urllib.request  # Python 3 replacement for the Python-2-only urllib2


class HtmlDownloader(object):
    """Downloads the raw HTML body of a URL."""

    def download(self, url):
        """Return the response body (bytes) for url, or None when url is None
        or the server does not answer with HTTP 200.

        The garbled source lost the status constant (`getcode() != :`);
        200 is the check used by this tutorial.
        """
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
4. html_parser
# coding: utf8
# 4. html_parser -- extracts follow-up links and page data from downloaded HTML.
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin  # Python 3 home of the old urlparse.urljoin


class HtmlParser(object):
    """Parses one page into (set of new /view/ URLs, dict of extracted data)."""

    def _get_new_urls(self, page_url, soup):
        # Collect in-site links of the form /view/123.htm and absolutize them
        # against the page they were found on.
        new_urls = set()
        links = soup.find_all("a", href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_full_url = urljoin(page_url, link["href"])
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # NOTE(review): the source article is truncated in the middle of this
        # method; the selectors below are reconstructed from the well-known
        # tutorial this text comes from -- verify against the live page markup.
        res_data = {"url": page_url}
        # <dl class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-"> ... <h1>Title</h1>
        title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data["title"] = title_node.get_text()
        # <div class="lemma-summary"> summary text </div>
        summary_node = soup.find("div", class_="lemma-summary")
        res_data["summary"] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Return (new_urls, new_data) for the page, or None on missing input."""
        if page_url is None or html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, "html.parser")
        return self._get_new_urls(page_url, soup), self._get_new_data(page_url, soup)


# 5. html_outputer
# coding: utf-8
# 5. html_outputer -- accumulates parsed page data and renders it as an HTML table.


class HtmlOutputer(object):
    """Collects page-data dicts and writes them to output.html."""

    def __init__(self):
        self.datas = []  # list of dicts with keys 'url', 'title', 'summary'

    def collect_data(self, data):
        """Store one page's data dict; None is silently ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write the collected rows to output.html as a simple table.

        NOTE(review): the source article is truncated right after the first
        write() call; the table markup below is reconstructed from the tutorial
        this text comes from -- confirm against the original.
        """
        # `with` + explicit encoding: the garbled original opened without
        # closing visibly and without an encoding.
        with open("output.html", "w", encoding="utf-8") as fout:
            fout.write("<html>")
            fout.write("<body>")
            fout.write("<table>")
            for data in self.datas:
                fout.write("<tr>")
                fout.write("<td>%s</td>" % data["url"])
                fout.write("<td>%s</td>" % data["title"])
                fout.write("<td>%s</td>" % data["summary"])
                fout.write("</tr>")
            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")
Python crawler practice: crawling the Baidu Encyclopedia "Python" entry