This crawler uses the urllib2 module and the regular expression module, with gevent supplying the coroutines. Let's go straight to the code:
[code]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Download web content via the urllib/urllib2 modules
import urllib, urllib2, gevent
# Import the regular expression and time modules
import re, time
from gevent import monkey
monkey.patch_all()

def geturllist(url):
    url_list = []
    print url
    s = urllib2.urlopen(url)
    text = s.read()
    # Regex match: pick out the pictures in the page
    html = re.search(r'<ol.*</ol>', text, re.S)
    # Capture the image URL up to the dot; "jpg" is re-appended below
    urls = re.finditer(r'<img src="(.+?\.)jpg"', html.group(), re.I)
    for i in urls:
        url = i.group(1).strip() + "jpg"
        url_list.append(url)
    return url_list

def download(down_url):
    # Timestamp prefix keeps names unique; re.sub strips the URL path
    name = str(time.time())[:-3] + "_" + re.sub('.+?/', '', down_url)
    print name
    urllib.urlretrieve(down_url, "d:\\temp\\" + name)

def getpageurl():
    page_list = []
    # Loop over the list pages
    for page in range(1, 700):
        url = "http://jandan.net/ooxx/page-" + str(page) + "#comments"
        # Append the generated url to page_list
        page_list.append(url)
    print page_list
    return page_list

if __name__ == '__main__':
    jobs = []
    pageurl = getpageurl()[::-1]
    # Download the pictures, one greenlet per image
    for i in pageurl:
        for downurl in geturllist(i):
            jobs.append(gevent.spawn(download, downurl))
    gevent.joinall(jobs)
[/code]
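One thing to watch: the main loop spawns a greenlet for every image on every one of the 700 pages before joining, so the number of simultaneous downloads is unbounded. Below is a minimal sketch of the same loop using gevent's Pool to cap concurrency; the pool size of 20 is my own assumption, so tune it to your connection and to what the site will tolerate.

[code]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sketch: the same crawl, but with a bounded gevent pool.
# The pool size (20) is an assumed value, not from the original script.
import gevent
from gevent import monkey
from gevent.pool import Pool
monkey.patch_all()

# getpageurl(), geturllist() and download() are the functions defined above

if __name__ == '__main__':
    pool = Pool(20)                         # at most 20 downloads in flight
    for page in getpageurl()[::-1]:
        for down_url in geturllist(page):
            pool.spawn(download, down_url)  # blocks while the pool is full
    pool.join()                             # wait for every queued download
[/code]

With a pool, back-pressure is built in: spawn() simply waits for a free slot instead of queueing tens of thousands of greenlets up front.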
The whole program is only 45 lines and nothing in it is especially difficult, so it is worth studying closely. Following the same principle, you can develop crawlers for other sites; a sketch of one reusable piece follows below. That's all I'll say for now ~~
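If you do adapt this pattern elsewhere, one practical addition is guarding the network calls: as written, a single bad URL or timeout will crash the whole run. Here is a hedged sketch of a reusable fetch helper; the retry count and timeout are assumed values, not part of the original script.

[code]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sketch: a defensive wrapper around urllib2.urlopen for reuse in other
# crawlers. The retry count (3) and timeout (10s) are assumed defaults.
import socket
import time
import urllib2

def fetch(url, retries=3, timeout=10):
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url, timeout=timeout).read()
        except (urllib2.URLError, socket.timeout):
            time.sleep(2 ** attempt)  # simple exponential backoff
    return None  # let the caller decide what a permanent failure means
[/code]

geturllist() could then call fetch(url) instead of urllib2.urlopen(url).read() and simply skip pages that come back as None.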