This article explains how to implement a simple crawler workflow in Python.
The workflow is: open the seed URL --> collect all the URLs found on the seed page --> check whether each URL has already been crawled, adding un-crawled URLs to the URL list --> parse the required information out of each page --> write it to the database.
Five objects can be abstracted from this process: the initiator (main.py), the downloader (spider_downloader.py), the parser (spider_parser.py), the URL manager (spider_url_manager.py), and the outputer (spider_outputer.py).
First look at how the launcher is implemented:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import spider_outputer
import spider_parser
import spider_downloader
import spider_url_manager


class Main:
    """Crawler entry point: wires together the URL manager, downloader,
    parser and outputer, and drives the crawl loop."""

    def __init__(self):
        self.urls = spider_url_manager.UrlManager()
        self.downloader = spider_downloader.Downloader()
        self.outputer = spider_outputer.Outputer()
        self.parser = spider_parser.Parser()

    def craw(self, root_url):
        """Crawl breadth-first starting from root_url.

        Each iteration pops one pending URL, downloads it, parses out new
        links and user data, queues the links and stores the data. Errors
        on a single page are printed and the crawl continues.
        """
        self.urls.add_new_url(root_url)
        count = 1
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('Craw %d: %s' % (count, new_url))
                html_content = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parser(new_url, html_content)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # Bug fix: the original reset `count = 1` here, so the
                # progress counter never advanced past 1.
                count += 1
            except Exception as e:
                # Best-effort crawl: log the failure and move on.
                print(e)


if __name__ == "__main__":
    url = 'https://www.zhihu.com/people/li-xiao-miao-70/activities'
    main = Main()
    main.craw(url)
Downloader (spider_downloader.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import string
from urllib import request
from urllib.parse import quote

user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36'


class Downloader:
    """Fetches a URL and returns its HTML body as text."""

    def download(self, url):
        """Download `url` and return the decoded HTML, or None on failure.

        Returns None when `url` is None or the server does not answer
        with HTTP 200.
        """
        if url is None:
            return None
        # Percent-encode any non-printable/non-ASCII characters so that
        # urlopen accepts URLs containing e.g. Chinese characters.
        _url = quote(url, safe=string.printable)
        req = request.Request(_url, headers={'User-Agent': user_agent})
        response = request.urlopen(req)
        # Bug fix: the original comparison was truncated ("!=" with no
        # operand); a non-200 status is treated as a failed download.
        if response.getcode() != 200:
            print("Request Bad")
            return None
        html = response.read()
        return html.decode('utf8')
Parser (spider_parser.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
from urllib import parse
from bs4 import BeautifulSoup
import user_info


class Parser:
    """Extracts outgoing links and Zhihu author info from an HTML page."""

    def parser(self, page_url, html_content):
        """Parse one page.

        Returns a tuple (new_urls, new_data) where new_urls is a set of
        absolute URLs found on the page and new_data is a list of
        user_info.UserInfo objects. Returns None when either argument
        is None.
        """
        if page_url is None or html_content is None:
            return
        soup = BeautifulSoup(html_content, "html.parser")
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Collect every <a href> on the page, resolved against page_url."""
        new_urls = set()
        links = soup.find_all('a')
        for link in links:
            new_url = link.get('href')
            # urljoin turns relative hrefs into absolute URLs.
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Build a UserInfo for each author block on the page.

        NOTE(review): the CSS class names below ('AuthorInfo',
        'UserLink-link', 'AuthorInfo-avatar', 'AuthorInfo-badgeText')
        were reconstructed from a garbled source — confirm against the
        live Zhihu markup, which changes over time.
        """
        ret_data = []
        author_infos = soup.find_all(class_='AuthorInfo')
        print(len(author_infos))
        for author_info in author_infos:
            # The profile link looks like ".../people/<id>"; keep only the
            # part after 'e/' as the user id.
            user_id = author_info.find(class_='UserLink-link').get('href')
            user_id = user_id[user_id.find('e/') + 2:]
            # srcset is "url 1x, url 2x ..."; keep only the first URL
            # (everything before the first space).
            photo = author_info.find(class_='AuthorInfo-avatar').get('srcset')
            pos = photo.find(' ')
            photo = photo[:pos]
            name = author_info.find('meta', attrs={'itemprop': 'name'}).get('content')
            profession = author_info.find(class_='AuthorInfo-badgeText').get_text()
            user = user_info.UserInfo(user_id, name, photo, profession)
            ret_data.append(user)
        return ret_data
URL Manager (spider_url_manager.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""


class UrlManager:
    """Tracks which URLs are pending (visit_url) and done (visited_url),
    deduplicating across both sets."""

    def __init__(self):
        # URLs already crawled.
        self.visited_url = set()
        # URLs waiting to be crawled.
        self.visit_url = set()

    def add_new_url(self, url):
        """Queue `url` unless it is None or already known (pending or done)."""
        if url is None:
            return
        if url not in self.visit_url and url not in self.visited_url:
            self.visit_url.add(url)

    def has_new_url(self):
        """Return True while there are pending URLs."""
        return len(self.visit_url) != 0

    def add_new_urls(self, urls):
        """Queue every URL in the iterable `urls`; no-op for None/empty.

        Bug fixes vs. the original: it tested the undefined name `url`
        instead of `urls`, used `=` where `==` was meant, and passed the
        whole collection to add_new_url instead of each element.
        """
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        """Pop one pending URL, mark it visited, and return it."""
        new_url = self.visit_url.pop()
        self.visited_url.add(new_url)
        return new_url
Outputer (spider_outputer.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import user_dao


class Outputer:
    """Persists parsed user records through the user DAO."""

    def __init__(self):
        self.user_dao = user_dao.Userdao()

    def collect_data(self, data):
        """Store each user in `data`; prints 'None' and returns when
        there is nothing to store."""
        if data is None:
            print('None')
            return
        else:
            for d in data:
                self.user_dao.add_user(d)
Other Classes
Database helper class (db_helper.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import pymysql


class DBHelper:
    """Thin wrapper around pymysql: one connection per statement,
    with commit/rollback and guaranteed cleanup."""

    def _get_connection(self):
        """Open a new connection to the local 'crawler' database."""
        return pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                               db='crawler', port=3306, charset='utf8')

    def execute_update(self, sql, params):
        """Run an INSERT/UPDATE/DELETE and commit; rollback on error.

        Prints the affected row count on success, the exception on failure.
        """
        connection = self._get_connection()
        cursor = connection.cursor()
        try:
            if params is None:
                s = sql
            else:
                # NOTE(review): %-interpolating params into SQL is
                # injection-prone; prefer cursor.execute(sql, params),
                # which lets the driver escape values.
                s = (sql % params)
            result = cursor.execute(s)
            connection.commit()
            print('rows: ', result)
        except Exception as e:
            print(e)
            connection.rollback()
        finally:
            cursor.close()
            connection.close()

    def query(self, sql, params=None):
        """Run a SELECT and print every row; rollback on error."""
        connection = self._get_connection()
        cursor = connection.cursor()
        try:
            if params is None:
                s = sql
            else:
                # NOTE(review): same injection concern as execute_update.
                s = (sql % params)
            cursor.execute(s)
            rows = cursor.fetchall()
            for row in rows:
                print(row)
        except Exception as e:
            print(e)
            connection.rollback()
        finally:
            cursor.close()
            connection.close()
User data access class (user_dao.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import db_helper


class Userdao:
    """Data-access object for the zhihu_user table."""

    def __init__(self):
        self.db = db_helper.DBHelper()

    def add_user(self, user):
        """Insert one UserInfo row; INSERT IGNORE skips duplicates
        (deduplication relies on the table's unique key on user_id —
        confirm against the schema)."""
        sql = "insert ignore into zhihu_user(user_id, photo, name, profession) " \
              "values('%s', '%s', '%s', '%s')"
        params = (user.user_id, user.photo, user.name, user.profession)
        self.db.execute_update(sql=sql, params=params)
User Information Class (user_info.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""


class UserInfo:
    """Value object for one crawled Zhihu user: id, name, avatar URL,
    and profession."""

    def __init__(self, user_id, name, photo, profession):
        self.user_id = user_id
        self.name = name
        self.photo = photo
        self.profession = profession

    def __str__(self):
        return ('UserInfo(name=' + self.name + ', photo=' + self.photo +
                ', profession=' + self.profession + ')\n')

    def __repr__(self):
        # Bug fix: the original called `self. __STR__()` (wrong case),
        # which would raise AttributeError.
        return self.__str__()
In Python, `print` invokes an object's `__str__` method when rendering it as output.
This small demo crawls Zhihu users' ID, avatar, name, and profession information.