Python's simplest crawler

Source: Internet
Author: User

The crawler has five main parts: the main function, the URL manager, the web page downloader, the web page parser, and the web page outputer.

(A set is used for the URL queues, but the code below does not really show off the advantages of a set; this could be improved in a follow-up.)
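For reference, the advantage the note is hinting at is most likely the constant-time membership test a set gives you when deduplicating URLs. A minimal sketch (the URLs and variable names below are illustrative, not from the post):

seen = set()
for url in ["http://baike.baidu.com/view/21087.htm",
            "http://baike.baidu.com/view/10812319.htm",
            "http://baike.baidu.com/view/21087.htm"]:   # duplicate on purpose
    if url not in seen:       # O(1) on average, unlike scanning a list
        seen.add(url)
print len(seen)               # prints 2: the duplicate was dropped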

Main entry function, spider_main (each of the four classes below lives in its own module, which this file imports):

import url_manager, html_downloader, html_outputer, html_parser

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)                  # enqueue the root node
        while self.urls.has_new_url():                   # similar to a breadth-first search
            try:
                new_url = self.urls.get_new_url()        # take a new url from the queue
                print 'craw %d : %s' % (count, new_url)  # which url is being crawled
                html_cont = self.downloader.download(new_url)                # fetch the page
                new_urls, new_data = self.parser.parse(new_url, html_cont)   # parse out the urls and data inside it
                self.urls.add_new_urls(new_urls)         # add the newly found urls to the queue
                self.outputer.collect_data(new_data)     # collect the data
                self.outputer.output_html()              # write the data out
                if count == 10:
                    break
                count += 1
            except Exception as e:
                print e
                print 'craw failed'

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"   # root node
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

URL Manager:

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):             # add a single new url to the queue
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):           # add urls in batch
        if urls is None or len(urls) == 0:  # nothing was crawled, or an empty collection came back
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):                  # is the queue non-empty?
        return len(self.new_urls) != 0
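One caveat, not raised in the original: set.pop() removes an arbitrary element, so the crawl order is not strictly first-in-first-out even though the main loop is described as breadth-first. A hedged alternative sketch that pairs a collections.deque (the queue) with a set (the dedup check); the class name FifoUrlManager is made up for illustration:

from collections import deque

class FifoUrlManager(object):          # hypothetical name, not from the post
    def __init__(self):
        self.queue = deque()           # urls waiting to be crawled, in arrival order
        self.seen = set()              # every url ever enqueued

    def add_new_url(self, url):
        if url is None or url in self.seen:
            return
        self.seen.add(url)
        self.queue.append(url)

    def add_new_urls(self, urls):
        for url in urls or []:         # tolerates None and empty collections
            self.add_new_url(url)

    def get_new_url(self):
        return self.queue.popleft()    # oldest url first, true FIFO order

    def has_new_url(self):
        return len(self.queue) != 0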

Web Page Downloader:

import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)   # open the url and get the response
        if response.getcode() != 200:     # anything other than 200 means the request failed
            return None
        return response.read()            # return the full content of the page
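Note that urllib2 exists only on Python 2. If you want to run the downloader on Python 3, a minimal sketch using the standard-library urllib.request module would look like this (porting the rest of the post is left as an assumption; only this class is shown):

import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()        # bytes; decode it if a str is needed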


Web page parser:

import re
import urlparse
from bs4 import BeautifulSoup


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):   # pull urls out of the page (regex match)
        new_urls = set()
        # get the <a> tags whose href matches the given pattern
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']                              # take the href itself
            new_full_url = urlparse.urljoin(page_url, new_url)  # join it into a full link
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):   # pull the data out of the page
        res_data = {}                          # key-value pairs: url, title, summary
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title">
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
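To see what parse() returns, here is a hedged usage example. The HTML fragment below is hand-written for illustration (it is not a real Baidu Baike page), but it uses the same tag and class names the parser looks for:

parser = HtmlParser()
page_url = "http://baike.baidu.com/view/21087.htm"
html_cont = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">A programming language.</div>
  <a href="/view/10812319.htm">Guido van Rossum</a>
</body></html>
'''
new_urls, new_data = parser.parse(page_url, html_cont)
print new_urls                  # set(['http://baike.baidu.com/view/10812319.htm'])
print new_data['title']         # Python
print new_data['summary']       # A programming language.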


Web page outputer:

class HtmlOutputer(object):

    def __init__(self):
        self.datas = []

    def collect_data(self, data):   # collect each data dict (key-value pairs) into a list
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open("F:\\output.html", 'w')
        fout.write("
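The original post breaks off here, in the middle of output_html. A minimal sketch of how the method could be completed, assuming each collected dict becomes one row of an HTML table (the output path, the table layout, and the encode('utf-8') calls are assumptions, not recovered from the post):

    def output_html(self):
        fout = open("F:\\output.html", 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:    # one table row per crawled page
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()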

