# Reprinted from the author's own blog: http://www.mylonly.com/archives/1418.html
#!/usr/bin/env python
# coding: utf-8
#############################################################
# File Name: main.py
# Author: mylonly
# Mail: [email protected]
# Created Time: Wed June 08:22:12 PM CST
#########################################################################
from __future__ import print_function

import HTMLParser
import Queue
import re
import threading
import time
import urllib2
# Entry links of every photo album, filled in from the index pages.
Htmldoorlist = []
# Links of the HTML pages that contain images.
Htmlurllist = []
# Queue of image URLs waiting to be downloaded (maxsize 0 means unbounded).
imageurllist = Queue.Queue(0)
# Number of image URLs captured so far.
Imagegetcount = 0
# Number of images downloaded so far.
Imagedownloadcount = 0
# Next-page link of the album being crawled; it is compared with the album's
# entry address to decide when the album is finished. Empty means "not set".
Nexthtmlurl = ''
# Local directory the images are saved into (must end with a path separator
# because file names are appended to it directly).
Localsavepath = '/data/1920x1080/'
# To download another resolution, change Replace_str. Available resolutions:
# 1920x1200, 1980x1920, 1680x1050, 1600x900, 1440x900, 1366x768, 1280x1024,
# 1024x768, 1280x800.
Replace_str = '1920x1080'
# Resolution token found in the captured image URL; it is swapped for
# Replace_str. Neither token may carry padding or the replace never matches.
Replaced_str = '960x600'
# Parser for the album inner pages.
class Imagehtmlparser(HTMLParser.HTMLParser):
    """Extract the big-image URL and the next-page link from an album page.

    Results are published through module-level state rather than instance
    attributes: image URLs are put on ``imageurllist`` (and counted in
    ``Imagegetcount``), and the next-page link is stored in ``Nexthtmlurl``.

    Matching is positional, exactly as the page markup is expected to be laid
    out: the ``id`` must be the first attribute of the tag.
    NOTE(review): the value taken from the second ``img`` attribute is
    presumably its ``src`` -- confirm against the target page.
    """

    def __init__(self):
        self.nexturl = ''
        HTMLParser.HTMLParser.__init__(self)

    @staticmethod
    def _attr_is(attr, name, value):
        """Return True if *attr* is the pair (name, value).

        HTMLParser lower-cases attribute names but leaves values untouched,
        so the value is compared case-insensitively. A valueless attribute
        arrives with ``None`` as its value and never matches.
        """
        return attr[0] == name and (attr[1] or '').lower() == value

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; *tag* is already lower-cased by HTMLParser.

        ``attrs`` is the list of ``(name, value)`` pairs of the tag.
        """
        global Imagegetcount, Nexthtmlurl
        if tag == 'img' and len(attrs) > 2:
            if self._attr_is(attrs[0], 'id', 'bigimg'):
                url = attrs[1][1]
                if url:
                    # Swap the preview resolution for the wanted one.
                    url = url.replace(Replaced_str, Replace_str)
                    imageurllist.put(url)
                    Imagegetcount = Imagegetcount + 1
                    print(url)
        elif tag == 'a' and len(attrs) == 4:
            if (self._attr_is(attrs[0], 'id', 'pagenext')
                    and self._attr_is(attrs[1], 'class', 'next')):
                Nexthtmlurl = attrs[2][1]
# Parser for the index (listing) pages.
class Indexhtmlparser(HTMLParser.HTMLParser):
    """Collect album entry links and the next index-page link.

    An album link is recognised in two steps: first a tag from ``taglist[0]``
    carrying the attribute value ``classlist[0]``, then a tag from
    ``taglist[1]`` carrying ``classlist[1]``. ``index`` records which of the
    two steps is expected next. The link itself is read from the second
    attribute of the second-level tag.

    Attributes:
        urllist: album links collected so far.
        index: 0 while waiting for the first-level tag, 1 for the second.
        nexturl: value of the second attribute of the last ``<a>`` whose id
            is ``pageNext`` (compared case-insensitively); '' until one is
            seen.
    """

    def __init__(self):
        self.urllist = []
        self.index = 0
        self.nexturl = ''
        # HTMLParser reports tag names in lower case, so these must be too.
        self.taglist = ['li', 'a']
        self.classlist = ['photo-list-padding', 'pic']
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Advance the two-step match or record the next-page link.

        ``attrs`` is the list of ``(name, value)`` pairs of the tag.
        """
        if tag == self.taglist[self.index]:
            for attr in attrs:
                if attr[1] != self.classlist[self.index]:
                    continue
                if self.index == 0:
                    # First level found; now wait for the link inside it.
                    self.index = 1
                else:
                    # Second level found; go back to looking for the next item.
                    self.index = 0
                    # The link is the second attribute; skip tags without one
                    # instead of raising IndexError out of feed().
                    if len(attrs) > 1:
                        print(attrs[1][1])
                        self.urllist.append(attrs[1][1])
                # Stop scanning this tag so its remaining attributes are not
                # compared against the other level's class.
                break
        elif tag == 'a':
            for attr in attrs:
                if attr[0] == 'id' and (attr[1] or '').lower() == 'pagenext':
                    if len(attrs) > 1:
                        self.nexturl = attrs[1][1]
                        print(' Nexturl: ', self.nexturl)
                    break
# Parser for the index (listing) pages.
Indexparser = Indexhtmlparser()
# Parser for the album inner pages.
Imageparser = Imagehtmlparser()
# Walk the index pages and collect every album entry link.
print(' Start Scan homepage ... ')
Host = 'http://www.poco.cn'
Indexurl = '/vision.htx&index_type=hot&gid=-1#list'
while Indexurl != '':
    print(' Crawling Web page: ', Host + Indexurl)
    Request = urllib2.Request(Host + Indexurl)
    try:
        m = urllib2.urlopen(Request)
        try:
            con = m.read()
        finally:
            m.close()
    except urllib2.URLError as e:
        print(e.reason)
        # Indexurl is unchanged here, so looping again would request the same
        # failing page forever; stop paging and keep what was collected.
        break
    Indexparser.feed(con)
    # The parser keeps its last next-page link, so a page that links to itself
    # (or offers no new link) marks the end of the listing.
    if Indexurl == Indexparser.nexturl:
        break
    Indexurl = Indexparser.nexturl
print(' Home scan completed, all Atlas links have been obtained: ')
Htmldoorlist = Indexparser.urllist
# Collect all image URLs starting from the album entry links.
class Getimageurl(threading.Thread):
    """Thread that walks every album in ``Htmldoorlist`` page by page.

    Each fetched page is fed to ``Imageparser``, which puts the image URLs on
    ``imageurllist`` and stores the following page in ``Nexthtmlurl``.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Crawl each album until its next-page link stops advancing."""
        global Nexthtmlurl
        for door in Htmldoorlist:
            print(' starts getting the image address, the entry address is: ', door)
            Nexthtmlurl = ''
            while door != '':
                # The first request goes to the entry page, later ones follow
                # the next-page link found by the parser.
                page = Nexthtmlurl if Nexthtmlurl != '' else door
                print(' starts getting pictures from page%s ... ' % (Host + page))
                request = urllib2.Request(Host + page)
                try:
                    m = urllib2.urlopen(request)
                    try:
                        con = m.read()
                    finally:
                        m.close()
                except urllib2.URLError as e:
                    print(e.reason)
                    # Retrying would hit the same page endlessly; give up on
                    # this album and move on to the next one.
                    break
                Imageparser.feed(con)
                print(' The next page address is: ', Nexthtmlurl)
                # Finished when the link wraps around to the entry page, or
                # when it did not advance (missing or self-referencing link),
                # which would otherwise re-fetch the same page forever.
                if Nexthtmlurl in ('', door, page):
                    break
        print(' All picture addresses have been obtained: ', imageurllist)
class GetImage(threading.Thread):
    """Thread that downloads the URLs queued on ``imageurllist``.

    Each image is saved under ``Localsavepath`` using the ``<digits>.jpg``
    part of its URL as the file name; URLs without such a part are skipped.
    ``Imagedownloadcount`` is incremented for every file written.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Download queued images until the queue is found empty."""
        # Declared before first use: the counter is both printed and rebound.
        global Imagedownloadcount
        name_pattern = re.compile(r'[0-9]*\.jpg')
        print(' Start downloading pictures ... ')
        while True:
            print(' Current number of captured images: ', Imagegetcount)
            print(' Downloaded number of images: ', Imagedownloadcount)
            # Blocks until a URL is available.
            image = imageurllist.get()
            print(' Download file path: ', image)
            try:
                response = urllib2.urlopen(image)
                try:
                    cont = response.read()
                finally:
                    response.close()
            except urllib2.URLError as e:
                print(e.reason)
            else:
                match = name_pattern.search(image)
                if match:
                    print(' Downloading file: ', match.group())
                    filename = Localsavepath + match.group()
                    # "with" closes the file even if the write fails.
                    with open(filename, 'wb') as f:
                        f.write(cont)
                    Imagedownloadcount = Imagedownloadcount + 1
                else:
                    print(' no match ')
            # Checked after failures too; otherwise a failed download of the
            # last queued URL would leave the thread blocked in get() forever.
            if imageurllist.empty():
                break
        print(' file full download complete ... ')
# Start the thread that collects image URLs.
get = Getimageurl()
get.start()
print(' Get Picture link thread start: ')
# The downloader stops as soon as it finds the queue empty after a download;
# presumably this pause gives the collector time to queue some URLs first.
time.sleep(2)
# Start the thread that downloads the queued images.
Download = GetImage()
Download.start()
print(' Download picture chain thread Start: ')
# Python crawler series: simple web scraping, upgraded with multithreading support