The crawler has five main parts: the main entry function, the URL manager, the web page downloader, the web page parser, and the web page renderer (outputer).
(A set is used for URL deduplication, but the code below does not really highlight the advantages of set; this could be improved in a follow-up.)
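For reference, a tiny sketch (with made-up URLs) of what set buys us here, namely average O(1) membership tests and automatic deduplication, neither of which a plain list gives:

seen = set()
for url in ["http://baike.baidu.com/view/1.htm",
            "http://baike.baidu.com/view/1.htm",    # duplicate
            "http://baike.baidu.com/view/2.htm"]:
    if url not in seen:    # O(1) on average for a set; O(n) for a list
        seen.add(url)
print len(seen)            # 2 -- the duplicate was dropped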
Main entry function, Spider_main:
import url_manager, html_downloader, html_outputer, html_parser

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)                  # enqueue the root node
        while self.urls.has_new_url():                   # similar to breadth-first search
            try:
                new_url = self.urls.get_new_url()        # take a new url off the queue
                print 'craw %d : %s' % (count, new_url)  # which url is being crawled
                html_cont = self.downloader.download(new_url)               # fetch the page
                new_urls, new_data = self.parser.parse(new_url, html_cont)  # parse out inner urls and data
                self.urls.add_new_urls(new_urls)         # enqueue the newly found urls
                self.outputer.collect_data(new_data)     # collect the data
                if count == 10:
                    break
                count += 1
            except Exception as e:
                print e
                print 'craw failed'
        self.outputer.output_html()                      # write out the collected data

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"   # root node
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
URL Manager:
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()    # urls waiting to be crawled
        self.old_urls = set()    # urls already crawled

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):           # add a new url to the queue
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):         # add urls in batch
        if urls is None or len(urls) == 0:    # nothing was crawled, or the batch is empty
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):                # whether the queue still holds urls to crawl
        return len(self.new_urls) != 0
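A quick usage sketch (the URL is just the root node from above) showing how a url migrates from new_urls to old_urls and how duplicates are silently ignored:

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
manager.add_new_url("http://baike.baidu.com/view/21087.htm")   # duplicate, ignored
print manager.has_new_url()    # True
url = manager.get_new_url()    # moves the url into old_urls
manager.add_new_url(url)       # already crawled, so ignored again
print manager.has_new_url()    # False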
Web Page Downloader:
import urllib2

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)    # open the url and get the response
        if response.getcode() != 200:      # anything other than 200 means failure
            return None
        return response.read()             # return the full content of the page
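The bare urlopen call above sends urllib2's default User-Agent and can wait indefinitely; a hedged variant (download_with_headers is my name, not part of the original) that fakes a browser header and bounds the wait:

import urllib2

def download_with_headers(url, timeout=5):
    if url is None:
        return None
    # some servers reject urllib2's default User-Agent, so pretend to be a browser
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib2.urlopen(request, timeout=timeout)   # timeout in seconds
    except urllib2.URLError:
        return None
    if response.getcode() != 200:
        return None
    return response.read()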
Web page parser:
import re
import urlparse
from bs4 import BeautifulSoup

class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):    # extract urls from the page text (regular-expression match)
        new_urls = set()
        # collect the <a> tags whose href matches the given pattern
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']                               # take the href attribute
            new_full_url = urlparse.urljoin(page_url, new_url)   # join into an absolute link
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):    # extract the data fields from the page
        res_data = {}    # key-value pairs: a title entry and a summary entry
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()
        # print res_data['title']
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        # print res_data['summary']
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
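To see the two parsing tools in isolation, a self-contained sketch (the html snippet is invented) of find_all with a regular expression plus urljoin:

import re
import urlparse
from bs4 import BeautifulSoup

html = ('<html><body>'
        '<a href="/view/123.htm">matches the pattern</a>'
        '<a href="/history">does not match, so it is skipped</a>'
        '</body></html>')
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a', href=re.compile(r"/view/\d+\.htm")):
    # urljoin resolves the relative href against the page url
    print urlparse.urljoin("http://baike.baidu.com/view/21087.htm", link['href'])
    # -> http://baike.baidu.com/view/123.htm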
Web page renderer:
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):    # accumulate the data items (dicts of key-value pairs)
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open("F:\\output.html", 'w')
        fout.write("
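The listing breaks off inside output_html; a minimal sketch of how it could finish, assuming only the url/title/summary keys that the parser collects (not necessarily the author's exact markup):

    def output_html(self):
        fout = open("F:\\output.html", 'w')
        fout.write("<html><body><table>")
        for data in self.datas:    # one table row per crawled page
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))    # the pages are utf-8
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table></body></html>")
        fout.close()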