This article explains how to implement a simple crawler workflow in Python.
The workflow is: open the seed URL --> collect all the URLs found on the seed page --> check whether each URL has already been crawled, adding un-crawled URLs to the URL list --> parse the required information out of each page --> write it to the database.
Five objects can be abstracted from this process: the initiator (main.py), the downloader (spider_downloader.py), the parser (spider_parser.py), the URL manager (spider_url_manager.py), and the outputer (spider_outputer.py).
First look at how the launcher is implemented:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import spider_outputer
import spider_parser
import spider_downloader
import spider_url_manager


class Main:
    """Crawler entry point: wires together the URL manager, downloader,
    parser and outputer, and drives the crawl loop."""

    def __init__(self):
        self.urls = spider_url_manager.UrlManager()
        self.downloader = spider_downloader.Downloader()
        self.outputer = spider_outputer.Outputer()
        self.parser = spider_parser.Parser()

    def craw(self, root_url):
        """Crawl breadth-first starting from root_url.

        Each iteration pops one pending URL, downloads it, parses out new
        links and user data, queues the links and stores the data. Errors
        on a single page are printed and the crawl continues.
        """
        self.urls.add_new_url(root_url)
        count = 1
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('Craw %d: %s' % (count, new_url))
                html_content = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parser(new_url, html_content)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # Bug fix: the original reset `count = 1` here, so the
                # progress counter never advanced past 1.
                count += 1
            except Exception as e:
                # Best-effort crawl: log the failure and move on.
                print(e)


if __name__ == "__main__":
    url = 'https://www.zhihu.com/people/li-xiao-miao-70/activities'
    main = Main()
    main.craw(url)
Downloader (spider_downloader.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import string
from urllib import request
from urllib.parse import quote

user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36'


class Downloader:
    """Fetches a URL and returns its HTML body as text."""

    def download(self, url):
        """Download `url` and return the decoded HTML, or None on failure.

        Returns None when `url` is None or the server does not answer
        with HTTP 200.
        """
        if url is None:
            return None
        # Percent-encode any non-printable/non-ASCII characters so that
        # urlopen accepts URLs containing e.g. Chinese characters.
        _url = quote(url, safe=string.printable)
        req = request.Request(_url, headers={'User-Agent': user_agent})
        response = request.urlopen(req)
        # Bug fix: the original comparison was truncated ("!=" with no
        # operand); a non-200 status is treated as a failed download.
        if response.getcode() != 200:
            print("Request Bad")
            return None
        html = response.read()
        return html.decode('utf8')
Parser (spider_parser.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
from urllib import parse
from bs4 import BeautifulSoup
import user_info


class Parser:
    """Extracts outgoing links and Zhihu author info from an HTML page."""

    def parser(self, page_url, html_content):
        """Parse one page.

        Returns a tuple (new_urls, new_data) where new_urls is a set of
        absolute URLs found on the page and new_data is a list of
        user_info.UserInfo objects. Returns None when either argument
        is None.
        """
        if page_url is None or html_content is None:
            return
        soup = BeautifulSoup(html_content, "html.parser")
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Collect every <a href> on the page, resolved against page_url."""
        new_urls = set()
        links = soup.find_all('a')
        for link in links:
            new_url = link.get('href')
            # urljoin turns relative hrefs into absolute URLs.
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Build a UserInfo for each author block on the page.

        NOTE(review): the CSS class names below ('AuthorInfo',
        'UserLink-link', 'AuthorInfo-avatar', 'AuthorInfo-badgeText')
        were reconstructed from a garbled source — confirm against the
        live Zhihu markup, which changes over time.
        """
        ret_data = []
        author_infos = soup.find_all(class_='AuthorInfo')
        print(len(author_infos))
        for author_info in author_infos:
            # The profile link looks like ".../people/<id>"; keep only the
            # part after 'e/' as the user id.
            user_id = author_info.find(class_='UserLink-link').get('href')
            user_id = user_id[user_id.find('e/') + 2:]
            # srcset is "url 1x, url 2x ..."; keep only the first URL
            # (everything before the first space).
            photo = author_info.find(class_='AuthorInfo-avatar').get('srcset')
            pos = photo.find(' ')
            photo = photo[:pos]
            name = author_info.find('meta', attrs={'itemprop': 'name'}).get('content')
            profession = author_info.find(class_='AuthorInfo-badgeText').get_text()
            user = user_info.UserInfo(user_id, name, photo, profession)
            ret_data.append(user)
        return ret_data
URL Manager (spider_url_manager.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""


class UrlManager:
    """Tracks which URLs are pending (visit_url) and done (visited_url),
    deduplicating across both sets."""

    def __init__(self):
        # URLs already crawled.
        self.visited_url = set()
        # URLs waiting to be crawled.
        self.visit_url = set()

    def add_new_url(self, url):
        """Queue `url` unless it is None or already known (pending or done)."""
        if url is None:
            return
        if url not in self.visit_url and url not in self.visited_url:
            self.visit_url.add(url)

    def has_new_url(self):
        """Return True while there are pending URLs."""
        return len(self.visit_url) != 0

    def add_new_urls(self, urls):
        """Queue every URL in the iterable `urls`; no-op for None/empty.

        Bug fixes vs. the original: it tested the undefined name `url`
        instead of `urls`, used `=` where `==` was meant, and passed the
        whole collection to add_new_url instead of each element.
        """
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        """Pop one pending URL, mark it visited, and return it."""
        new_url = self.visit_url.pop()
        self.visited_url.add(new_url)
        return new_url
Outputer (spider_outputer.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import user_dao


class Outputer:
    """Persists parsed user records through the user DAO."""

    def __init__(self):
        self.user_dao = user_dao.Userdao()

    def collect_data(self, data):
        """Store each user in `data`; prints 'None' and returns when
        there is nothing to store."""
        if data is None:
            print('None')
            return
        else:
            for d in data:
                self.user_dao.add_user(d)
Other Classes
Database helper class (db_helper.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import pymysql


class DBHelper:
    """Thin wrapper around pymysql: one connection per statement,
    with commit/rollback and guaranteed cleanup."""

    def _get_connection(self):
        """Open a new connection to the local 'crawler' database."""
        return pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                               db='crawler', port=3306, charset='utf8')

    def execute_update(self, sql, params):
        """Run an INSERT/UPDATE/DELETE and commit; rollback on error.

        Prints the affected row count on success, the exception on failure.
        """
        connection = self._get_connection()
        cursor = connection.cursor()
        try:
            if params is None:
                s = sql
            else:
                # NOTE(review): %-interpolating params into SQL is
                # injection-prone; prefer cursor.execute(sql, params),
                # which lets the driver escape values.
                s = (sql % params)
            result = cursor.execute(s)
            connection.commit()
            print('rows: ', result)
        except Exception as e:
            print(e)
            connection.rollback()
        finally:
            cursor.close()
            connection.close()

    def query(self, sql, params=None):
        """Run a SELECT and print every row; rollback on error."""
        connection = self._get_connection()
        cursor = connection.cursor()
        try:
            if params is None:
                s = sql
            else:
                # NOTE(review): same injection concern as execute_update.
                s = (sql % params)
            cursor.execute(s)
            rows = cursor.fetchall()
            for row in rows:
                print(row)
        except Exception as e:
            print(e)
            connection.rollback()
        finally:
            cursor.close()
            connection.close()
User data access class (user_dao.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""
import db_helper


class Userdao:
    """Data-access object for the zhihu_user table."""

    def __init__(self):
        self.db = db_helper.DBHelper()

    def add_user(self, user):
        """Insert one UserInfo row; INSERT IGNORE skips duplicates
        (deduplication relies on the table's unique key on user_id —
        confirm against the schema)."""
        sql = "insert ignore into zhihu_user(user_id, photo, name, profession) " \
              "values('%s', '%s', '%s', '%s')"
        params = (user.user_id, user.photo, user.name, user.profession)
        self.db.execute_update(sql=sql, params=params)
User Information Class (user_info.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 2017/9/23
@author: TT
"""


class UserInfo:
    """Value object for one crawled Zhihu user: id, name, avatar URL,
    and profession."""

    def __init__(self, user_id, name, photo, profession):
        self.user_id = user_id
        self.name = name
        self.photo = photo
        self.profession = profession

    def __str__(self):
        return ('UserInfo(name=' + self.name + ', photo=' + self.photo +
                ', profession=' + self.profession + ')\n')

    def __repr__(self):
        # Bug fix: the original called `self. __STR__()` (wrong case),
        # which would raise AttributeError.
        return self.__str__()
In Python, `print` invokes an object's `__str__` method when rendering it as output.
This small demo crawls Zhihu users' ID, avatar, name, and profession information.