Python Simple Crawler

This article explains how to implement a simple crawler process in Python.

Open the seed URL --> collect every URL on the seed page --> check whether each one has already been crawled, and add the not-yet-crawled URLs to the URL list --> parse the required information out of the page --> write it to the database.

Five objects can be abstracted from this process: the initiator (main.py), the downloader (spider_downloader.py), the parser (spider_parser.py), the URL manager (spider_url_manager.py), and the outputer (spider_outputer.py).

First, look at how the initiator (main.py) is implemented:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: tt
"""
import spider_outputer
import spider_parser
import spider_downloader
import spider_url_manager


class Main:
    def __init__(self):
        self.urls = spider_url_manager.UrlManager()
        self.downloader = spider_downloader.Downloader()
        self.outputer = spider_outputer.Outputer()
        self.parser = spider_parser.Parser()

    def craw(self, root_url):
        self.urls.add_new_url(root_url)
        count = 1
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                html_content = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parser(new_url, html_content)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                count += 1
            except Exception as e:
                print(e)


if __name__ == '__main__':
    url = 'https://www.zhihu.com/people/li-xiao-miao-70/activities'
    main = Main()
    main.craw(url)
Downloader (spider_downloader.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: tt
"""
import string
from urllib import request
from urllib.parse import quote

user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36'


class Downloader:
    def download(self, url):
        if url is None:
            return None
        # Percent-encode anything that is not printable ASCII (e.g. Chinese
        # characters in the path) so urllib can open the URL.
        _url = quote(url, safe=string.printable)
        req = request.Request(_url, headers={'User-Agent': user_agent})
        response = request.urlopen(req)
        if response.getcode() != 200:
            print('request bad')
            return None
        html = response.read()
        return html.decode('utf-8')
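
As a quick illustration (not from the original article) of what the quote() call in download() does: printable ASCII characters are left alone, while anything else, such as Chinese characters in a profile path, is percent-encoded so urllib can open the URL. The example URL below is made up.

import string
from urllib.parse import quote

url = 'https://www.zhihu.com/people/李小苗'  # hypothetical non-ASCII path
print(quote(url, safe=string.printable))
# https://www.zhihu.com/people/%E6%9D%8E%E5%B0%8F%E8%8B%97
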
Parser (spider_parser.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: tt
"""
from urllib import parse
from bs4 import BeautifulSoup
import user_info


class Parser:
    def parser(self, page_url, html_content):
        if page_url is None or html_content is None:
            return
        soup = BeautifulSoup(html_content, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # Collect every link on the page, resolved to an absolute URL.
        new_urls = set()
        links = soup.find_all('a')
        for link in links:
            new_url = link.get('href')
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # Extract one UserInfo per author block on the page.
        ret_data = []
        author_infos = soup.find_all(class_='AuthorInfo')
        print(len(author_infos))
        for author_info in author_infos:
            user_id = author_info.find(class_='UserLink-link').get('href')
            user_id = user_id[user_id.find('e/') + 2:]  # strip the '/people/' prefix
            photo = author_info.find(class_='AuthorInfo-avatar').get('srcset')
            pos = photo.find(' ')
            photo = photo[:pos]  # keep only the first candidate in the srcset
            name = author_info.find('meta', attrs={'itemprop': 'name'}).get('content')
            profession = author_info.find(class_='AuthorInfo-badgeText').get_text()
            user = user_info.UserInfo(user_id, name, photo, profession)
            ret_data.append(user)
        return ret_data
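
_get_new_urls relies on parse.urljoin to turn relative hrefs into absolute URLs before they are queued. A minimal sketch of that behavior (the second URL is made up):

from urllib.parse import urljoin

page_url = 'https://www.zhihu.com/people/li-xiao-miao-70/activities'
# A relative href found on the page is resolved against the page's URL.
print(urljoin(page_url, '/people/another-user'))
# https://www.zhihu.com/people/another-user

Note that the class names in _get_new_data ('AuthorInfo', 'UserLink-link', and so on) are tied to Zhihu's markup at the time of writing; if the page structure changes, the parser breaks.
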
URL Manager (spider_url_manager.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: tt
"""


class UrlManager:
    def __init__(self):
        self.visited_url = set()  # URLs that have already been crawled
        self.visit_url = set()    # URLs waiting to be crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.visit_url and url not in self.visited_url:
            self.visit_url.add(url)

    def has_new_url(self):
        return len(self.visit_url) != 0

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        new_url = self.visit_url.pop()
        self.visited_url.add(new_url)
        return new_url
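
A minimal usage sketch (not part of the original code) showing the manager's contract: duplicates are ignored, and get_new_url moves a URL from the pending set to the visited set, so each URL is handed out at most once.

from spider_url_manager import UrlManager

manager = UrlManager()
manager.add_new_url('https://www.zhihu.com/people/a')
manager.add_new_url('https://www.zhihu.com/people/a')  # duplicate, ignored
while manager.has_new_url():
    print(manager.get_new_url())  # each URL appears exactly once

Because visit_url is a set, pop() returns URLs in no particular order; a collections.deque would give the crawl a breadth-first order instead.
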
Outputer (spider_outputer.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: tt
"""
import user_dao


class Outputer:
    def __init__(self):
        self.user_dao = user_dao.UserDao()

    def collect_data(self, data):
        if data is None:
            print('None')
            return
        for d in data:
            self.user_dao.add_user(d)
Other Classes: Database Helper Class (db_helper.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: tt
"""
import pymysql


class DBHelper:
    def _get_connection(self):
        return pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                               db='crawler', port=3306, charset='utf8')

    def execute_update(self, sql, params):
        connection = self._get_connection()
        cursor = connection.cursor()
        try:
            if params is None:
                s = sql
            else:
                s = (sql % params)
            result = cursor.execute(s)
            connection.commit()
            print('rows:', result)
        except Exception as e:
            print(e)
            connection.rollback()
        finally:
            cursor.close()
            connection.close()

    def query(self, sql, params=None):
        connection = self._get_connection()
        cursor = connection.cursor()
        try:
            if params is None:
                s = sql
            else:
                s = (sql % params)
            cursor.execute(s)
            rows = cursor.fetchall()
            for row in rows:
                print(row)
        except Exception as e:
            print(e)
            connection.rollback()
        finally:
            cursor.close()
            connection.close()
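
A quick usage sketch of the helper (assuming the zhihu_user table used below already exists):

from db_helper import DBHelper

db = DBHelper()
db.query('SELECT user_id, name FROM zhihu_user')  # prints each fetched row

One caveat: building SQL with (sql % params) is vulnerable to SQL injection; passing the parameters separately as cursor.execute(sql, params) is the safer pymysql idiom.
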
User Data Access Class (user_dao.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: tt
"""
import db_helper


class UserDao:
    def __init__(self):
        self.db = db_helper.DBHelper()

    def add_user(self, user):
        sql = ("INSERT IGNORE INTO zhihu_user (user_id, photo, name, profession) "
               "VALUES ('%s', '%s', '%s', '%s')")
        params = (user.user_id, user.photo, user.name, user.profession)
        self.db.execute_update(sql=sql, params=params)
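
The article never shows the table that add_user writes into. A plausible schema matching the INSERT IGNORE statement above (the column types are an assumption) can be created once, using the same connection settings as db_helper.py:

import pymysql

# Hypothetical schema inferred from user_dao.py; adjust types as needed.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS zhihu_user (
    user_id    VARCHAR(64) PRIMARY KEY,
    photo      VARCHAR(255),
    name       VARCHAR(64),
    profession VARCHAR(128)
) DEFAULT CHARSET = utf8
"""

connection = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                             db='crawler', port=3306, charset='utf8')
try:
    with connection.cursor() as cursor:
        cursor.execute(CREATE_TABLE_SQL)
    connection.commit()
finally:
    connection.close()

With user_id as the primary key, INSERT IGNORE silently skips users that were already stored on an earlier crawl instead of raising a duplicate-key error.
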
User Information Class (user_info.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: tt
"""


class UserInfo:
    def __init__(self, user_id, name, photo, profession):
        self.user_id = user_id
        self.name = name
        self.photo = photo
        self.profession = profession

    def __str__(self):
        return ('UserInfo(name=' + self.name + ', photo=' + self.photo +
                ', profession=' + self.profession + ')\n')

    def __repr__(self):
        return self.__str__()

In Python, printing an object calls the object's __str__ method to produce the printed text.
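
A small demonstration with made-up values (containers such as lists fall back to each element's __repr__, which is why UserInfo forwards __repr__ to __str__):

from user_info import UserInfo

user = UserInfo('42', 'Alice', 'https://example.com/a.jpg', 'Engineer')
print(user)    # print calls __str__
print([user])  # a list uses __repr__ for its elements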

This small demo crawls Zhihu users' IDs, avatars, names, and profession information.
