This year Mr. Sun (Sun xiansen) downloaded the Tangcha ("Tang tea") app for the first time. The books looked great, but he found it hard to keep reading them diligently, and, having no silver to spare (Mr. Yin liangran), he learned how to use Python to move them toward the cloud step by step. Before any campaign, first send scouts to survey the terrain; this is also the usual method of a small crawler (Mr. Xiao crab). Home page: http://www.tangcha.tc/books. New arrivals: http://www.tangcha.tc/books/latest (82*5 = 410 books). Ranking: http://www.tangcha.tc/books/top (410 books). The site also has the current recommendations, a publisher list and a book-list ranking. Next, pick a starting point: "to shoot the rider, first shoot the horse". We start with http://www.tangcha.tc/books/top, because the page structure is simple and does not change, so a master can deal with it in about three seconds. Solution:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: insun
import urllib2
import re


# Obtain the e-book list; capture id, link, thumb_cover, title, author.
def get_booklist():
    """Download the ranking page and extract one tuple per listed book.

    Returns:
        The list produced by ``findall``: one tuple per ``book-cell`` item
        with the groups (id, link, thumb_cover, title, author). The list is
        empty when nothing on the page matches.
    """
    url = 'http://www.tangcha.tc/books/top'
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    # NOTE(review): the published text lost the <img ...> fragment and
    # capitalised the tag names. The thumbnail group is reconstructed from
    # the documented capture list -- confirm against the real page markup.
    reg = re.compile(
        r'<li class="book-cell">.+?<span>(.*?).</span>'
        r'.+?<a href="(.*?)" class="cell-item boxable">'
        r'.+?<img src="(.*?)".+?</figure>'
        r'.+?<p class="book-title">(.*?)</p>'
        r'.+?<p class="book-author">(.*?)</p>',
        re.S)  # re.S lets "." span the newlines between tags
    return reg.findall(html)
Obtain the e-book list, capturing id, link, thumb_cover, title and author. The id captured here is not the real id, so we save it as bid. The real id is the integer at the end of the link (href="/books/906"), and we save it as rid. With the rid we go on to the next (detail) page to collect the rest. Powerful enemies are good at transformation and disguise, and the detail pages vary: # href="/books/906" /books/626 /books/707 /books/472 # /books/414 # /books/429 and /books/424 (<div class="no-related-items"> versus <div class="related-items">). Once these thorns are cleared, the road is open. The preliminary code is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: insun
# Python 2 script: urllib2 fetches the pages, re extracts the fields.
import urllib2
import re
# Obtain the e-book list, capture id, link, thumb_cover, title, author
def get_booklist():
    """Download the ranking page and extract one tuple per listed book.

    Returns:
        The list produced by ``findall``: one tuple per ``book-cell`` item
        with the groups (id, link, thumb_cover, title, author). The list is
        empty when nothing on the page matches.
    """
    url = 'http://www.tangcha.tc/books/top'
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    # NOTE(review): the published text lost the <img ...> fragment and
    # capitalised the tag names. The thumbnail group is reconstructed from
    # the documented capture list -- confirm against the real page markup.
    reg = re.compile(
        r'<li class="book-cell">.+?<span>(.*?).</span>'
        r'.+?<a href="(.*?)" class="cell-item boxable">'
        r'.+?<img src="(.*?)".+?</figure>'
        r'.+?<p class="book-title">(.*?)</p>'
        r'.+?<p class="book-author">(.*?)</p>',
        re.S)  # re.S lets "." span the newlines between tags
    return reg.findall(html)
# Obtain e-book details, capture cover, title, author, publisher, douban_rate, content, author_intro
def get_bookdetail(href):
    """Fetch one book's detail page and extract its fields with a regex.

    The pattern is assembled from fragments chosen according to which
    optional sections (publisher, Douban rating) occur in the page, so the
    number of capture groups is the same in every case.

    Args:
        href: site-relative link of the book, e.g. "/books/906".

    Returns:
        The list produced by ``findall``: one tuple per match with the
        groups (cover, title, author, publisher, douban_rate, content).
        The list is empty when the page does not match the pattern.
    """
    detailurl = 'http://www.tangcha.tc' + href
    response = urllib2.urlopen(detailurl)
    try:
        detailhtml = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    if re.search('book-publisher book-info-entry', detailhtml) is not None:
        publish = (r'.+?<p class="book-publisher book-info-entry">'
                   r'.+?<a.+?>(.*?)</a>.+?</p>'
                   r'.+?<p class="book-device">.+?')
    else:
        # No publisher entry: the group captures the filler text instead,
        # which keeps the group positions stable.
        publish = r'(.*?)<p class="book-device">'

    if re.search('douban-rating-number', detailhtml) is not None:
        douban = r'.+?<div class="douban-rating-number">(.*?)</div></a>'
        description = r'(.*?)<div class=".+?">'
    else:
        # No rating: a placeholder group stands in for it.
        douban = r'(.*?)'
        description = (r'.+?<section id="book-description">'
                       r'(.*?)<div class=".+?">')

    # NOTE(review): the published text lost the <img ...> fragment and
    # capitalised the tag names. The cover group is reconstructed from the
    # documented capture list -- confirm against the real page markup.
    dreg = re.compile(
        r'<figure class="book-cover">.+?<img src="(.*?)".+?</figure>'
        r'.+?<p class="book-title">(.*?)</p>'
        r'.+?<p class="book-author book-info-entry">.+?<a.+?>(.*?)</a>.+?</p>'
        + publish + douban + description,
        re.S)  # re.S lets "." span the newlines between tags
    return dreg.findall(detailhtml)
# Walk the ranking list and print the detail tuple of every book.
for i in get_booklist():
    href = i[1]  # second group of the list pattern is the link
    details = get_bookdetail(href)
    if not details:
        # get_bookdetail returns [] when the page layout does not match;
        # indexing it would raise IndexError, so skip this book instead.
        print(href + ' skipped: detail pattern did not match')
        continue
    # Single-argument print(...) behaves the same as the print statement.
    print(details[0])
To store the collected data in our backpack, we bring in MongoDB; please install it yourself. We have also written a separate introduction to getting started with MongoDB from Python.
Top (ranking list) script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: insun
import urllib
import urllib2
import sys
import re
import pymongo
import os

# Handle to the "test" database on the default MongoDB connection.
# NOTE(review): pymongo.Connection was removed in PyMongo 3.0 in favour of
# MongoClient -- confirm the installed driver version.
db = pymongo.Connection().test

# Create the image output directories on first run.
if not os.path.exists('thumb'):
    os.mkdir('thumb')
if not os.path.exists('cover'):
    os.mkdir('cover')
# New Book mounting http://www.tangcha.tc/books/latest 82*5 = 410
# Ranking http://www.tangcha.tc/books/top 410
# Current recommendation http://www.tangcha.tc/books/recommendation
# Obtain the e-book list, capture id, link, thumb_cover, title, author
def get_booklist():
    """Download the ranking page and extract one tuple per listed book.

    Returns:
        The list produced by ``findall``: one tuple per ``book-cell`` item
        with the groups (id, link, thumb_cover, title, author). The list is
        empty when nothing on the page matches.
    """
    url = 'http://www.tangcha.tc/books/top'
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    # NOTE(review): the published text lost the <img ...> fragment and
    # capitalised the tag names. The thumbnail group is reconstructed from
    # how the caller indexes the tuple (thumb at 2, title at 3, author at
    # 4) -- confirm against the real page markup.
    reg = re.compile(
        r'<li class="book-cell">.+?<span>(.*?).</span>'
        r'.+?<a href="(.*?)" class="cell-item boxable">'
        r'.+?<img src="(.*?)".+?</figure>'
        r'.+?<p class="book-title">(.*?)</p>'
        r'.+?<p class="book-author">(.*?)</p>',
        re.S)  # re.S lets "." span the newlines between tags
    return reg.findall(html)
# Obtain e-book details, capture cover, title, author, publisher, douban_rate, content, author_intro
def get_bookdetail(href):
    """Fetch one book's detail page and extract its fields with a regex.

    The pattern is assembled from fragments chosen according to which
    optional sections (publisher, Douban rating) occur in the page, so the
    number of capture groups is the same in every case.

    Args:
        href: site-relative link of the book, e.g. "/books/906".

    Returns:
        The list produced by ``findall``: one tuple per match with the
        groups (cover, title, author, publisher, douban_rate, content).
        The list is empty when the page does not match the pattern.
    """
    detailurl = 'http://www.tangcha.tc' + href
    response = urllib2.urlopen(detailurl)
    try:
        detailhtml = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    if re.search('book-publisher book-info-entry', detailhtml) is not None:
        publish = (r'.+?<p class="book-publisher book-info-entry">'
                   r'.+?<a.+?>(.*?)</a>.+?</p>'
                   r'.+?<p class="book-device">.+?')
    else:
        # No publisher entry: the group captures the filler text instead,
        # which keeps the group positions stable.
        publish = r'(.*?)<p class="book-device">'

    if re.search('douban-rating-number', detailhtml) is not None:
        douban = r'.+?<div class="douban-rating-number">(.*?)</div></a>'
        description = r'(.*?)<div class=".+?">'
    else:
        # No rating: a placeholder group stands in for it.
        douban = r'(.*?)'
        description = (r'.+?<section id="book-description">'
                       r'(.*?)<div class=".+?">')

    # NOTE(review): the published text lost the <img ...> fragment and
    # capitalised the tag names. The cover group is reconstructed from how
    # the caller indexes the tuple (cover at 0, publisher at 3, rating at
    # 4, content at 5) -- confirm against the real page markup.
    dreg = re.compile(
        r'<figure class="book-cover">.+?<img src="(.*?)".+?</figure>'
        r'.+?<p class="book-title">(.*?)</p>'
        r'.+?<p class="book-author book-info-entry">.+?<a.+?>(.*?)</a>.+?</p>'
        + publish + douban + description,
        re.S)  # re.S lets "." span the newlines between tags
    return dreg.findall(detailhtml)
# Download size cover image
def download_cover(thumb, cover, rid):
    """Download the small and the large cover image of one book.

    Args:
        thumb: URL of the thumbnail image.
        cover: URL of the full-size cover image.
        rid: real book id, used to build both file names.

    The files are written to ``thumb/<rid>_thumb.jpg`` and
    ``cover/<rid>_cover.jpg``; both directories must already exist.
    """
    urllib.urlretrieve(thumb, 'thumb/' + str(rid) + "_thumb.jpg")
    # Lower-case "cover/" so the path also resolves on case-sensitive
    # file systems.
    urllib.urlretrieve(cover, 'cover/' + str(rid) + "_cover.jpg")
# Bid is a false id. rid is a real id.
# For every book on the ranking page: fetch its details, store one record
# in MongoDB and download both cover images.
for i in get_booklist():
    href = i[1]
    print(href)
    # The real id is the integer after "/books/" in the link.
    rid = int(href.replace("/books/", ""))
    details = get_bookdetail(href)
    if not details:
        # An empty match list used to raise "IndexError: list index out of
        # range" on the next line; skip the book instead of aborting.
        print(href + ' skipped: detail pattern did not match')
        continue
    details = details[0]
    douban_rate = details[4]
    values = dict(
        bid=i[0],
        rid=rid,
        thumb=i[2],
        title=i[3],
        author=i[4],
        publisher=details[3],
        cover=details[0],
        douban_rate=douban_rate,
        content=details[5],
    )
    # NOTE(review): Collection.save is deprecated in PyMongo 3 and removed
    # in PyMongo 4 -- confirm the installed driver version.
    db.books.save(values)
    download_cover(i[2], details[0], rid)
    print(i[3] + 'Download and save success')
Latest (new arrivals) script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: insun
import urllib
import urllib2
import sys
import re
import pymongo
import os

# Handle to the "test" database on the default MongoDB connection.
# NOTE(review): pymongo.Connection was removed in PyMongo 3.0 in favour of
# MongoClient -- confirm the installed driver version.
db = pymongo.Connection().test

# Create the image output directories on first run.
if not os.path.exists('thumb'):
    os.mkdir('thumb')
if not os.path.exists('cover'):
    os.mkdir('cover')
# New Book mounting http://www.tangcha.tc/books/latest 82*5 = 410
# Ranking http://www.tangcha.tc/books/top 410
# Current recommendation http://www.tangcha.tc/books/recommendation
# Obtain the e-book list, capture id, link, thumb_cover, title, author
def get_booklist():
    """Download the new-arrivals page and extract one tuple per book.

    Unlike the ranking pattern, this pattern has no leading ``<span>``
    group.

    Returns:
        The list produced by ``findall``: one tuple per ``book-cell`` item
        with the groups (link, thumb_cover, title, author). The list is
        empty when nothing on the page matches.
    """
    url = 'http://www.tangcha.tc/books/latest'
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    # NOTE(review): the published text lost the <img ...> fragment and
    # capitalised the tag names. The thumbnail group is reconstructed from
    # how the caller indexes the tuple (thumb at 1, title at 2, author at
    # 3) -- confirm against the real page markup.
    reg = re.compile(
        r'<li class="book-cell">.+?<a href="(.*?)" class="cell-item boxable">'
        r'.+?<img src="(.*?)".+?</figure>'
        r'.+?<p class="book-title">(.*?)</p>'
        r'.+?<p class="book-author">(.*?)</p>',
        re.S)  # re.S lets "." span the newlines between tags
    return reg.findall(html)
# Obtain e-book details, capture cover, title, author, publisher, douban_rate, content, author_intro
def get_bookdetail(href):
    """Fetch one book's detail page and extract its fields with a regex.

    The pattern is assembled from fragments chosen according to which
    optional sections (publisher, Douban rating) occur in the page, so the
    number of capture groups is the same in every case.

    Args:
        href: site-relative link of the book, e.g. "/books/906".

    Returns:
        The list produced by ``findall``: one tuple per match with the
        groups (cover, title, author, publisher, douban_rate, content).
        The list is empty when the page does not match the pattern.
    """
    # Sample pages noted by the author while developing the pattern:
    #   href="/books/906" /books/626 /books/707 /books/472
    #   /books/429
    #   /books/414 /books/596: no <p class="book-publisher book-info-entry">
    #   /books/424: <div class="no-related-items"> versus
    #               <div class="related-items">
    detailurl = 'http://www.tangcha.tc' + href
    response = urllib2.urlopen(detailurl)
    try:
        detailhtml = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    if re.search('book-publisher book-info-entry', detailhtml) is not None:
        publish = (r'.+?<p class="book-publisher book-info-entry">'
                   r'.+?<a.+?>(.*?)</a>.+?</p>'
                   r'.+?<p class="book-device">.+?')
    else:
        # No publisher entry: the group captures the filler text instead,
        # which keeps the group positions stable.
        publish = r'(.*?)<p class="book-device">'

    if re.search('douban-rating-number', detailhtml) is not None:
        douban = r'.+?<div class="douban-rating-number">(.*?)</div></a>'
        description = r'(.*?)<div class=".+?">'
    else:
        # No rating: a placeholder group stands in for it.
        douban = r'(.*?)'
        description = (r'.+?<section id="book-description">'
                       r'(.*?)<div class=".+?">')

    # NOTE(review): the published text lost the <img ...> fragment and
    # capitalised the tag names. The cover group is reconstructed from how
    # the caller indexes the tuple (cover at 0, publisher at 3, rating at
    # 4, content at 5) -- confirm against the real page markup.
    dreg = re.compile(
        r'<figure class="book-cover">.+?<img src="(.*?)".+?</figure>'
        r'.+?<p class="book-title">(.*?)</p>'
        r'.+?<p class="book-author book-info-entry">.+?<a.+?>(.*?)</a>.+?</p>'
        + publish + douban + description,
        re.S)  # re.S lets "." span the newlines between tags
    return dreg.findall(detailhtml)
# Download size cover image
def download_cover(thumb, cover, rid):
    """Download the small and the large cover image of one book.

    Args:
        thumb: URL of the thumbnail image.
        cover: URL of the full-size cover image.
        rid: real book id, used to build both file names.

    The files are written to ``thumb/<rid>_thumb.jpg`` and
    ``cover/<rid>_cover.jpg``; both directories must already exist.
    """
    urllib.urlretrieve(thumb, 'thumb/' + str(rid) + "_thumb.jpg")
    # Lower-case "cover/" so the path also resolves on case-sensitive
    # file systems.
    urllib.urlretrieve(cover, 'cover/' + str(rid) + "_cover.jpg")
# Bid is a false id. rid is a real id.
# For every book on the new-arrivals page: fetch its details, store one
# record in MongoDB and download both cover images.
for i in get_booklist():
    href = i[0]
    print(href)
    # The real id is the integer after "/books/" in the link.
    rid = int(href.replace("/books/", ""))
    details = get_bookdetail(href)
    if not details:
        # An empty match list used to raise "IndexError: list index out of
        # range" on the next line; skip the book instead of aborting.
        print(href + ' skipped: detail pattern did not match')
        continue
    details = details[0]
    douban_rate = details[4]
    values = dict(
        bid=0,  # this list pattern captures no id, so 0 is stored
        rid=rid,
        thumb=i[1],
        title=i[2],
        author=i[3],
        publisher=details[3],
        cover=details[0],
        douban_rate=douban_rate,
        content=details[5],
    )
    # NOTE(review): Collection.save is deprecated in PyMongo 3 and removed
    # in PyMongo 4 -- confirm the installed driver version.
    db.books.save(values)
    download_cover(i[1], details[0], rid)
    print(i[2] + 'Download and save success')
We have added cover-image downloading and saved all the collected treasure to MongoDB, ready for the next step. At this point the backpack is full. How can we rebuild a free version of the website from it? More on that next time.