Crawlers crawl pictures of Baidu Post bars. Incomplete. Please advise.
The Code is as follows: Use Python3.5
Certificate -----------------------------------------------------------------------------------------------------------------------------------------------------
import os
import re
import urllib.request
# Open a webpage
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    The request carries a desktop-browser ``User-Agent`` header instead of
    urllib's default one.

    Args:
        url: Absolute URL of the resource to fetch.

    Returns:
        The undecoded response body.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    )
    # Open the prepared Request object, not the bare URL: passing the URL
    # string would silently drop the header added above.
    with urllib.request.urlopen(req) as response:
        return response.read()
# Download and save images
def download(urls):
    """Download every matching image found on each page in *urls*.

    Each page is fetched with ``url_open`` and decoded as UTF-8. Every image
    URL matched on the page is saved into the current working directory,
    named after the last path segment of its URL.

    Args:
        urls: Iterable of page URLs to scan for images.
    """
    # NOTE(review): the original pattern was truncated in the source text.
    # This one assumes post images are ``<img class="BDE_Image" src="...jpg">``
    # tags -- confirm against the live page markup.
    image_re = re.compile(r'<img class="BDE_Image" src="([^"]+\.jpg)"')
    for link in urls:
        html = url_open(link).decode('utf-8')
        for image_url in image_re.findall(html):
            filename = image_url.split('/')[-1]
            urllib.request.urlretrieve(image_url, filename)
# Crawling the links of all posts on this page
def findlink(url):
    """Collect the post links on the listing page *url* and download their images.

    The page is fetched with ``url_open`` and decoded as UTF-8. Every
    ``<a href="/p/<digits>"`` link is turned into an absolute URL on
    ``tieba.baidu.com``, and the resulting list is handed to ``download``.

    Args:
        url: URL of the listing page to scan for post links.
    """
    html = url_open(url).decode('utf-8')
    post_paths = re.findall(r'<a href="(/p/\d+)"', html)
    # The captured path already starts with '/', so the base must not end
    # with one. dict.fromkeys drops repeated links while keeping page order,
    # so no post is downloaded twice.
    urls = ['http://tieba.baidu.com' + path
            for path in dict.fromkeys(post_paths)]
    download(urls)
# Creating folders
def start(folder='picture'):
    """Create the output folder, switch into it and start crawling.

    Reads the module-level ``url`` as the listing page to crawl. Images end
    up in *folder* because the working directory is changed to it before
    ``findlink`` runs.

    Args:
        folder: Directory that receives the downloaded images. It is created
            if missing and reused if it already exists.
    """
    # exist_ok avoids the FileExistsError that os.mkdir raises on a re-run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    findlink(url)
# name = urllib.parse.quote(input('Enter the post bar name: '))
# num = input('Enter the page offset (0 is the first page, 50 the second, 100 the third, and so on): ')
# You can replace the listing URL as needed. Note that the URL here has been reduced to the parameters needed for page selection.
# url = 'http://tieba.baidu.com/f?kw=' + name + '&ie=utf-8&pn=' + num
# Listing page to crawl: ``kw`` is the percent-encoded post bar name and
# ``pn`` is the value used for page selection (100 here).
url = ('http://tieba.baidu.com/f'
       '?kw=%E6%9D%A8%E4%B8%9E%E7%90%B3&ie=utf-8&pn=100')

# Run the crawl only when executed as a script, so importing this module
# does not create folders or hit the network.
if __name__ == '__main__':
    start()
Certificate -----------------------------------------------------------------------------------------------------------------------------------------------------
I have a lot of questions. Continue to learn and try again. If you are passing by, please kindly advise. Thank you very much.