The crawler has five main parts: the main entry function, the URL manager, the web page downloader, the web page parser, and the web page renderer (outputer).
(A set is used for URL deduplication, but the code below does not really highlight the advantages of set; this could be improved in a follow-up.)
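For reference, a tiny sketch (with made-up URLs) of what set buys us here, namely average O(1) membership tests and automatic deduplication, neither of which a plain list gives:

seen = set()
for url in ["http://baike.baidu.com/view/1.htm",
            "http://baike.baidu.com/view/1.htm",    # duplicate
            "http://baike.baidu.com/view/2.htm"]:
    if url not in seen:    # O(1) on average for a set; O(n) for a list
        seen.add(url)
print len(seen)            # 2 -- the duplicate was dropped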
Main entry function, Spider_main:
import url_manager, html_downloader, html_outputer, html_parser

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)                  # enqueue the root node
        while self.urls.has_new_url():                   # similar to breadth-first search
            try:
                new_url = self.urls.get_new_url()        # take a new url off the queue
                print 'craw %d : %s' % (count, new_url)  # which url is being crawled
                html_cont = self.downloader.download(new_url)               # fetch the page
                new_urls, new_data = self.parser.parse(new_url, html_cont)  # parse out inner urls and data
                self.urls.add_new_urls(new_urls)         # enqueue the newly found urls
                self.outputer.collect_data(new_data)     # collect the data
                if count == 10:
                    break
                count += 1
            except Exception as e:
                print e
                print 'craw failed'
        self.outputer.output_html()                      # write out the collected data

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"   # root node
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
URL Manager:
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()    # urls waiting to be crawled
        self.old_urls = set()    # urls already crawled

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):           # add a new url to the queue
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):         # add urls in batch
        if urls is None or len(urls) == 0:    # nothing was crawled, or the batch is empty
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):                # whether the queue still holds urls to crawl
        return len(self.new_urls) != 0
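A quick usage sketch (the URL is just the root node from above) showing how a url migrates from new_urls to old_urls and how duplicates are silently ignored:

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
manager.add_new_url("http://baike.baidu.com/view/21087.htm")   # duplicate, ignored
print manager.has_new_url()    # True
url = manager.get_new_url()    # moves the url into old_urls
manager.add_new_url(url)       # already crawled, so ignored again
print manager.has_new_url()    # False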
Web Page Downloader:
import urllib2

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)    # open the url and get the response
        if response.getcode() != 200:      # anything other than 200 means failure
            return None
        return response.read()             # return the full content of the page
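The bare urlopen call above sends urllib2's default User-Agent and can wait indefinitely; a hedged variant (download_with_headers is my name, not part of the original) that fakes a browser header and bounds the wait:

import urllib2

def download_with_headers(url, timeout=5):
    if url is None:
        return None
    # some servers reject urllib2's default User-Agent, so pretend to be a browser
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib2.urlopen(request, timeout=timeout)   # timeout in seconds
    except urllib2.URLError:
        return None
    if response.getcode() != 200:
        return None
    return response.read()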
Web page parser:
import re
import urlparse
from bs4 import BeautifulSoup

class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):    # extract urls from the page text (regular-expression match)
        new_urls = set()
        # collect the <a> tags whose href matches the given pattern
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']                               # take the href attribute
            new_full_url = urlparse.urljoin(page_url, new_url)   # join into an absolute link
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):    # extract the data fields from the page
        res_data = {}    # key-value pairs: a title entry and a summary entry
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()
        # print res_data['title']
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        # print res_data['summary']
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
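To see the two parsing tools in isolation, a self-contained sketch (the html snippet is invented) of find_all with a regular expression plus urljoin:

import re
import urlparse
from bs4 import BeautifulSoup

html = ('<html><body>'
        '<a href="/view/123.htm">matches the pattern</a>'
        '<a href="/history">does not match, so it is skipped</a>'
        '</body></html>')
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a', href=re.compile(r"/view/\d+\.htm")):
    # urljoin resolves the relative href against the page url
    print urlparse.urljoin("http://baike.baidu.com/view/21087.htm", link['href'])
    # -> http://baike.baidu.com/view/123.htm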
Web page renderer:
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):    # accumulate the data items (dicts of key-value pairs)
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open("F:\\output.html", 'w')
        fout.write("
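The listing breaks off inside output_html; a minimal sketch of how it could finish, assuming only the url/title/summary keys that the parser collects (not necessarily the author's exact markup):

    def output_html(self):
        fout = open("F:\\output.html", 'w')
        fout.write("<html><body><table>")
        for data in self.datas:    # one table row per crawled page
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))    # the pages are utf-8
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table></body></html>")
        fout.close()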