Brief introduction
A crawler is a program that automatically gathers information from the Internet; its value lies in putting that Internet data at your disposal. With crawled data you can do many things: compute statistics and comparisons, build an application around a particular topic, put together a news reader, and so on.
Crawler architecture
1) URL Manager
2) Web Downloader
3) Web Page Parser
4) Crawler Scheduler
5) Valuable data output
A minimal sketch of the interfaces these components expose to each other follows this list; the full implementation of each part is given in the next section.
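For orientation, here is an interface sketch (not part of the project files) showing what each component offers to the scheduler; the names follow the implementations listed below:

# Interface overview (a sketch; names match the implementations below)
class UrlManager(object):
    def add_new_url(self, url): pass     # register one URL to be crawled
    def add_new_urls(self, urls): pass   # register a batch of URLs
    def has_new_url(self): pass          # are there URLs left to crawl?
    def get_new_url(self): pass          # hand out one URL and mark it crawled


class HtmlDownloader(object):
    def download(self, url): pass        # fetch a page and return its raw HTML


class HtmlParser(object):
    def parse(self, page_url, html_cont): pass   # return (new_urls, new_data)


class HtmlOutputer(object):
    def collect_data(self, data): pass   # buffer the data extracted from one page
    def output_html(self): pass          # write everything to output.html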
Crawler implementation
1) Scheduler Implementation
# coding: utf-8
import url_manager
import html_downloader
import html_parser
import html_outputer


class SpiderMain(object):
    def __init__(self):
        # wire up the four components the scheduler drives
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print "craw %d: %s" % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # stop after 1000 pages
                    break
                count = count + 1
            except:
                print "craw failed"
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
2) URL Manager implementation
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs that have already been crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
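A quick usage example of the manager's de-duplication behaviour (the calls below are illustrative, not part of the project files):

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
manager.add_new_url("http://baike.baidu.com/view/21087.htm")  # duplicate, ignored
print manager.has_new_url()  # True
url = manager.get_new_url()  # the URL is moved to old_urls
manager.add_new_url(url)     # already crawled, ignored
print manager.has_new_url()  # False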
3) Web Downloader implementation
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # only accept responses with HTTP status 200
        if response.getcode() != 200:
            return None
        return response.read()
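Note that urllib2 exists only in Python 2. If you want to run the project under Python 3, a roughly equivalent downloader (a sketch, not part of the original tutorial) can use urllib.request instead:

import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        # only accept responses with HTTP status 200
        if response.getcode() != 200:
            return None
        return response.read()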
4) Web Page Parser implementation
from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        # collect links to other encyclopedia entries, e.g. /view/12345.htm
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # extract the entry title and its summary paragraph
        res_data = {}
        res_data['url'] = page_url
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find("h1")
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
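To see what the parser produces, here is a self-contained check against a tiny hand-written page (the HTML below is made up, but mirrors the structure the parser expects on a Baidu Baike entry page):

sample_html = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">Python is a programming language.</div>
  <a href="/view/12345.htm">another entry</a>
</body></html>
'''

parser = HtmlParser()
new_urls, new_data = parser.parse("http://baike.baidu.com/view/21087.htm", sample_html)
print new_urls             # set(['http://baike.baidu.com/view/12345.htm'])
print new_data['title']    # Python
print new_data['summary']  # Python is a programming language.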
5) Valuable data output
# coding: utf-8


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []  # one dict (url, title, summary) per crawled page

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
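The listing shows output_html only up to the point where the output file is opened. A minimal sketch of the rest of the method, assuming one table row per crawled page with the url, title and summary fields produced by the parser (the exact HTML layout is an assumption):

        # continuation of output_html (a sketch): one table row per crawled page
        fout.write("<html><head><meta charset='utf-8'></head><body><table border='1'>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            # title and summary come back from BeautifulSoup as unicode,
            # so encode them before writing to the file
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table></body></html>")
        fout.close()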
Execution
This crawler fetches 1,000 static Baidu Baike (Baidu encyclopedia) pages related to the keyword "Python", extracts mainly the entry title and summary from each page, and stores the results in an HTML file that can then be opened in a browser for viewing.