[Python] Book information crawler sample
Background Notes
We need to collect some book information, using Douban book entries as the source, extract the useful fields for each book, and save them to a local database.
Get book category tags
Refer to this link:
https://book.douban.com/tag/?view=type
These category tag links are then saved to a local file (book_tags.txt, which the crawler reads later); its contents look like the following, with a short collection sketch after the list:
https://book.douban.com/tag/novels
https://book.douban.com/tag/foreign literature
https://book.douban.com/tag/literature
https://book.douban.com/tag/essay
https://book.douban.com/tag/Chinese literature
https://book.douban.com/tag/Classics
https://book.douban.com/tag/Japanese literature
https://book.douban.com/tag/prose
https://book.douban.com/tag/Haruki Murakami
https://book.douban.com/tag/Poems
https://book.douban.com/tag/fairy tales ...
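Collecting these tag links can be done with a short one-off script. The following is a minimal sketch, assuming the tag index page above and the book_tags.txt file name used by the crawler; the CSS selector is an assumption about the current page layout.

#!/usr/bin/python
# coding:utf-8
# Sketch: collect tag links from the Douban tag index page and write them
# to book_tags.txt (the file the crawler below reads). The selector is an
# assumption about the page layout and may need adjusting.
import requests
from bs4 import BeautifulSoup

def dump_tag_links(outfile="book_tags.txt"):
    page = requests.get("https://book.douban.com/tag/?view=type")
    soup = BeautifulSoup(page.text, "lxml")
    seen = set()
    with open(outfile, "w") as fd:
        # every tag on the index page is rendered as an <a href="/tag/..."> link
        for a in soup.select("a[href^='/tag/']"):
            href = a["href"]
            if href not in seen:
                seen.add(href)
                fd.write("https://book.douban.com%s\n" % href)

if __name__ == "__main__":
    dump_tag_links()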
Get book information and save it to the local database
Assume that the MySQL table has been built, as follows:
CREATE TABLE `book_info` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `bookid` varchar(32) NOT NULL COMMENT 'book ID',
  `tag` varchar(128) DEFAULT '' COMMENT 'category tag',
  `bookname` varchar(256) NOT NULL COMMENT 'title',
  `subname` varchar(256) NOT NULL COMMENT 'subtitle',
  `author` varchar(256) DEFAULT '' COMMENT 'author',
  `translator` varchar(256) DEFAULT '' COMMENT 'translator',
  `press` varchar(128) DEFAULT '' COMMENT 'publisher',
  `publishat` date DEFAULT '0000-00-00' COMMENT 'publication date',
  `stars` float DEFAULT '0' COMMENT 'rating',
  `price_str` varchar(32) DEFAULT '' COMMENT 'price string',
  `hotcnt` int(11) DEFAULT '0' COMMENT 'number of reviews',
  `bookdesc` varchar(8192) DEFAULT NULL COMMENT 'description',
  `updateat` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'modified date',
  PRIMARY KEY (`id`),
  UNIQUE KEY `idx_bookid` (`bookid`),
  KEY `idx_bookname` (`bookname`),
  KEY `hotcnt` (`hotcnt`),
  KEY `stars` (`stars`),
  KEY `idx_tag` (`tag`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='book information';
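Once the table exists, crawled rows can be spot-checked with a short query. This is a minimal sketch; the connection settings mirror the DestDB class in the crawler below and are placeholders for your environment.

# Sketch: list the ten most-reviewed books that were crawled.
# Host, user, password and database names are placeholders.
import pymysql

conn = pymysql.connect(host="192.168.1.10", user="test", passwd="123456",
                       db="spider", charset="utf8")
cur = conn.cursor()
cur.execute("SELECT tag, bookname, stars, hotcnt FROM book_info "
            "ORDER BY hotcnt DESC LIMIT 10")
for tag, bookname, stars, hotcnt in cur.fetchall():
    print("[%s] %s  stars=%.1f  reviews=%d" % (tag, bookname, stars, hotcnt))
cur.close()
conn.close()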
The crawler logic has already been implemented, mainly with the BeautifulSoup package, as follows:
#!/usr/bin/python
# coding:utf-8
import re
import logging
import random
import time
import datetime
from hashlib import md5

import requests
import pymysql
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO,
                    format='[%(levelname)s][%(name)s][%(asctime)s] %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')


class DestDB:
    Host = "192.168.1.10"
    DB = "spider"
    Table = "book_info"
    User = "test"
    Pwd = "123456"


def connect_db(host, db, user, pwd):
    conn = pymysql.connect(host=host, user=user, passwd=pwd, db=db,
                           charset='utf8', connect_timeout=3600)
    # cursorclass=pymysql.cursors.DictCursor
    conn.autocommit(True)
    return conn


def disconnect_db(conn, cursor):
    cursor.close()
    conn.close()


# Extract the number of ratings; if fewer than 10 people rated the book,
# treat it as 10.
def hotratings(person):
    try:
        # the text looks like "(1234人评价)": strip the leading "(" and the
        # trailing "人评价)" to get the bare number
        ptext = person.get_text().split()[0]
        pc = int(ptext[1:len(ptext) - 4])
    except ValueError:
        pc = int(10)
    return pc


# Persist one page of results to the database
def save_to_db(tag, book_reslist):
    dest_conn = connect_db(DestDB.Host, DestDB.DB, DestDB.User, DestDB.Pwd)
    dest_cursor = dest_conn.cursor()

    isql = "insert ignore into book_info "
    isql += "(`bookid`,`tag`,`author`,`translator`,`bookname`,`subname`,`press`,"
    isql += "`publishat`,`price_str`,`stars`,`hotcnt`,`bookdesc`) values "
    isql += ",".join(["(%s)" % ",".join(['%s'] * 12)] * len(book_reslist))

    values = []
    for row in book_reslist:
        # for now use md5(bookname + author) as the unique bookid
        bookid = md5(("%s_%s" % (row[0], row[2])).encode('utf-8')).hexdigest()
        values.extend([bookid, tag] + row[:10])

    dest_cursor.execute(isql, tuple(values))
    disconnect_db(dest_conn, dest_cursor)


# Process one listing page
def do_parse(tag, url):
    page_data = requests.get(url)
    soup = BeautifulSoup(page_data.text.encode("utf-8"), "lxml")
    # extract the tag from the URL
    tag = url.split("?")[0].split("/")[-1]
    # grab author / publisher information
    details = soup.select("#subject_list > ul > li > div.info > div.pub")
    # grab ratings
    scores = soup.select("#subject_list > ul > li > div.info > div.star.clearfix > span.rating_nums")
    # grab number of ratings
    persons = soup.select("#subject_list > ul > li > div.info > div.star.clearfix > span.pl")
    # grab titles
    booknames = soup.select("#subject_list > ul > li > div.info > h2 > a")
    # grab descriptions
    descs = soup.select("#subject_list > ul > li > div.info > p")

    # pull the content out of the parsed elements
    book_reslist = []
    for detail, score, personcnt, bookname, desc in zip(details, scores,
                                                        persons, booknames, descs):
        try:
            subtitle = ""
            title_strs = [s.replace('\n', '').strip() for s in bookname.strings]
            title_strs = [s for s in title_strs if s]
            # some books carry a second (sub)title
            if not title_strs:
                continue
            elif len(title_strs) >= 2:
                bookname, subtitle = title_strs[:2]
            else:
                bookname = title_strs[0]

            # number of ratings
            hotcnt = hotratings(personcnt)
            desc = desc.get_text()
            stars = float('%.1f' % float(score.get_text() if score.get_text() else "-1"))

            author, translator, press, publishat, price = [""] * 5
            detail_texts = detail.get_text().replace('\n', '').split("/")
            detail_texts = [s.strip() for s in detail_texts]
            # some books have no translator information
            if len(detail_texts) == 4:
                author, press, publishat, price = detail_texts[:4]
            elif len(detail_texts) >= 5:
                author, translator, press, publishat, price = detail_texts[:5]
            else:
                continue

            # convert the publication date to a date type
            if re.match(r'^[\d]{4}-[\d]{1,2}', publishat):
                dts = publishat.split('-')
                publishat = datetime.date(int(dts[0]), int(dts[1]), 1)
            else:
                publishat = datetime.date(1000, 1, 1)

            book_reslist.append([author, translator, bookname, subtitle, press,
                                 publishat, price, stars, hotcnt, desc])
        except Exception as e:
            logging.error(e)

    logging.info("insert count: %d" % len(book_reslist))
    if len(book_reslist) > 0:
        save_to_db(tag, book_reslist)
        book_reslist = []

    return len(details)


def main():
    with open("book_tags.txt") as fd:
        tags = fd.readlines()

    for tag in tags:
        tag = tag.strip()
        logging.info("current tag url: %s" % tag)
        # each listing page shows 20 entries, so advance `start` by 20
        # (the step size is an assumption; adjust if the page size differs)
        for idx in range(0, 1000000, 20):
            try:
                url = "%s?start=%d&type=T" % (tag.strip(), idx)
                cnt = do_parse(tag.split('/')[-1], url)
                if cnt < 10:
                    break
                # sleep a few seconds to reduce the access frequency
                # (the interval values here are placeholders)
                time.sleep(random.randint(10, 20))
            except Exception as e:
                logging.warning("outer_err: %s" % e)
                time.sleep(30)  # back off for a while (placeholder duration)


if __name__ == "__main__":
    main()
Summary
The above code runs in a Python 3 environment;
You need to install BeautifulSoup first: pip install bs4 (the code also relies on requests, pymysql, and lxml);
During crawling, the access frequency must be controlled (see the fetch-helper sketch after this list);
Some fields need exception handling, such as the translator information and the number of reviewers.
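For the last two points, one option is to funnel all page requests through a small helper that adds a randomized delay, a browser-like User-Agent, and a couple of retries. This is a minimal sketch; the function name fetch_page, the delay range, and the header value are illustrative assumptions rather than part of the original code.

# Sketch of a throttled fetch helper with a simple retry loop; the name,
# delay range, and headers are illustrative assumptions.
import time
import random
import logging
import requests

HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; book-spider/0.1)"}

def fetch_page(url, retries=3, min_delay=5, max_delay=15):
    for attempt in range(1, retries + 1):
        # randomized pause before every request keeps the access rate low
        time.sleep(random.uniform(min_delay, max_delay))
        try:
            resp = requests.get(url, headers=HEADERS, timeout=30)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            logging.warning("fetch %s failed (attempt %d/%d): %s",
                            url, attempt, retries, e)
    return None

do_parse() could then call fetch_page(url) instead of requests.get(url) and simply skip the page when None is returned.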