This example, written for learning purposes, uses Python to scrape images from a Douban-related site and save them automatically. It uses the BeautifulSoup library to parse the HTML code. BeautifulSoup is an HTML/XML parser that is well suited to web crawlers. Environment: Python 2.7.6 with BS4; the script can be run from PowerShell or the command line. Make sure that the BS4 module is installed.
The code is as follows:
# -*- coding: utf-8 -*-
# 2013.12.36 19:41 wnlo-c209
# Download the images published on dbmeizi.com.
#
# Written for Python 2.7 with BeautifulSoup 4 (bs4). The urllib2 import
# falls back to urllib.request so the script also runs on Python 3.
import os
import sys
from contextlib import closing

try:
    import urllib2
except ImportError:  # Python 3: urllib2 was merged into urllib.request
    import urllib.request as urllib2

from bs4 import BeautifulSoup

# Site root; page URLs and image links are both built from it.
BASE_URL = 'http://www.dbmeizi.com/'

# Create the download folder inside the current working directory
# (os.getcwd() is where the script is run from, not where it is stored).
path = os.getcwd()
new_path = os.path.join(path, u'doubanque')
if not os.path.isdir(new_path):
    os.mkdir(new_path)


def page_loop(page=0):
    """Download every image from the site, page by page.

    Starting at ``page``, fetch each listing page, save every ``<img>`` it
    contains into ``new_path``, then move on to the next page. The function
    does not return normally: it exits the interpreter once a page with no
    images is reached.

    Args:
        page: Number of the first listing page to fetch (int or numeric
            string).

    Raises:
        SystemExit: With status 0 when a page contains no images.
        ValueError: If ``page`` cannot be converted to an integer.
    """
    page = int(page)
    # Iterate instead of recursing once per page, so a long crawl cannot
    # hit the interpreter's recursion limit.
    while True:
        url = BASE_URL + '?p=%s' % page
        # urlopen() results are not context managers on Python 2, hence
        # closing() to make sure the connection is released.
        with closing(urllib2.urlopen(url)) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        my_girl = soup.find_all('img')

        # End detection: a page without images means we ran past the end.
        if not my_girl:
            print(u'all captured has been completed')
            sys.exit(0)

        print(u'start grabbing')
        for girl in my_girl:
            link = girl.get('src')
            if not link:
                # <img> without a src attribute: nothing to download.
                continue
            flink = BASE_URL + link
            print(flink)
            with closing(urllib2.urlopen(flink)) as response:
                content2 = response.read()
            # The last 11 characters of the URL are used as the file name;
            # the file goes into the folder created at start-up.
            with open(os.path.join(new_path, flink[-11:]), 'wb') as code:
                code.write(content2)

        page += 1
        print(u'start to capture the next page')
        print(u'the %s page' % page)


page_loop()