Python crawler path: a simple web image capture script, upgraded (adds multithreading support)

Source: Internet
Author: User

Reposted from the author's own blog: http://www.mylonly.com/archives/1418.html

#!/usr/bin/env python
# coding: utf-8
#########################################################################
# File Name:main.py
# author:mylonly
# Mail: [email protected]
# Created time:wed June 08:22:12 PM CST
#########################################################################


import re
import threading
import time

try:  # Python 3 locations, bound to the names the script uses
    import html.parser as htmlparser
    import queue
    import urllib.request as urllib2
    from urllib.error import URLError
except ImportError:  # Python 2 fallback
    import HTMLParser as htmlparser
    import Queue as queue
    import urllib2
    from urllib2 import URLError

# Entry link of every gallery collected from the index pages.
htmldoorlist = []
# Links of HTML pages that contain pictures (declared by the original script;
# nothing below reads or writes it).
htmlurllist = []
# Queue of picture URLs waiting to be downloaded (maxsize 0 = unbounded).
imageurllist = queue.Queue(0)
# Number of picture URLs captured so far.
imagegetcount = 0
# Number of pictures downloaded so far.
imagedownloadcount = 0
# Link to the next page of the gallery currently being walked. Written by
# Imagehtmlparser, read by Getimageurl to decide when a gallery is finished.
nexthtmlurl = ''
# Local directory the pictures are written to (must already exist).
localsavepath = '/data/1920x1080/'

# To download another resolution change REPLACE_STR. The original author lists
# these choices: 1920X1200, 1980X1920, 1680X1050, 1600X900, 1440X900,
# 1366X768, 1280X1024, 1024x768, 1280x800.
REPLACE_STR = '1920x1080'
# Resolution token found in the picture URL on the page; swapped for REPLACE_STR.
REPLACED_STR = '960x600'

# End-of-stream marker put on imageurllist once every gallery has been
# scanned, so the downloader knows when to stop instead of guessing from
# Queue.empty().
_DONE = None

# File name used when saving a picture: a run of digits followed by ".jpg".
_JPG_NAME = re.compile(r'[0-9]*\.jpg')


def _log(*parts):
    """Print *parts* separated by single spaces, as one string."""
    print(' '.join('%s' % (part,) for part in parts))


def _fetch(url):
    """Return the raw body of *url* as bytes, closing the connection afterwards."""
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        response.close()


def _fetch_html(url):
    """Download *url* and return it as text suitable for HTMLParser.feed()."""
    # Only ASCII markup (tag names, ids, classes, links) is inspected, so
    # bytes that are not valid UTF-8 can safely be replaced.
    return _fetch(url).decode('utf-8', 'replace')


def _attr_is(attr, name, value):
    """Return True if the (name, value) pair *attr* matches.

    The attribute value is compared case-insensitively; *value* must be
    given in lowercase. A valueless attribute (value None) never matches.
    """
    return attr[0] == name and (attr[1] or '').lower() == value


# Parser for the inner (gallery) pages.
class Imagehtmlparser(htmlparser.HTMLParser):
    """Collect the large picture URL and the next-page link of a gallery page.

    Picture URLs are pushed onto the module-level ``imageurllist`` queue and
    counted in ``imagegetcount``; the next-page link is stored in the
    module-level ``nexthtmlurl``.
    """

    def __init__(self):
        self.nexturl = ''
        htmlparser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; *attrs* is a list of (name, value) pairs."""
        global imagegetcount, nexthtmlurl
        if tag == 'img' and len(attrs) > 2:
            if _attr_is(attrs[0], 'id', 'bigimg'):
                # The URL is taken from the second attribute, as in the
                # original (presumably "src" -- confirm against the page).
                url = attrs[1][1]
                if url:
                    url = url.replace(REPLACED_STR, REPLACE_STR)
                    imageurllist.put(url)
                    imagegetcount = imagegetcount + 1
                    _log(url)
        elif tag == 'a' and len(attrs) == 4:
            if (_attr_is(attrs[0], 'id', 'pagenext')
                    and _attr_is(attrs[1], 'class', 'next')):
                # Third attribute carries the link (presumably "href").
                nexthtmlurl = attrs[2][1] or ''


# Parser for the index (home) pages.
class Indexhtmlparser(htmlparser.HTMLParser):
    """Collect gallery entry links and the next index page link.

    A gallery link is recognised in two steps: first a ``li`` tag carrying the
    value ``photo-list-padding``, then an ``a`` tag carrying the value ``pic``.
    ``index`` records which of the two steps is expected next.
    """

    def __init__(self):
        self.urllist = []
        self.index = 0
        self.nexturl = ''
        self.taglist = ['li', 'a']
        self.classlist = ['photo-list-padding', 'pic']
        htmlparser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; *attrs* is a list of (name, value) pairs."""
        if tag == self.taglist[self.index]:
            wanted = self.classlist[self.index]
            if any(value == wanted for _name, value in attrs):
                if self.index == 0:
                    # First level found.
                    self.index = 1
                else:
                    # Second level found.
                    self.index = 0
                    # Link is the second attribute (presumably "href").
                    link = attrs[1][1] if len(attrs) > 1 else None
                    if link:
                        _log(link)
                        self.urllist.append(link)
        # Checked independently of the two-step state above, so that a
        # pagination link is not missed while an "a.pic" tag is awaited.
        if tag == 'a' and len(attrs) > 1:
            if any(_attr_is(attr, 'id', 'pagenext') for attr in attrs):
                self.nexturl = attrs[1][1] or ''
                _log(' Nexturl: ', self.nexturl)


# HTML parser for the index pages.
indexparser = Indexhtmlparser()
# HTML parser for the inner pages.
imageparser = Imagehtmlparser()

host = 'http://www.poco.cn'
indexurl = '/vision.htx&index_type=hot&gid=-1#list'


def scan_index():
    """Walk the index pages from ``indexurl`` and return the gallery links.

    Stops when a page yields no new next-page link, when a page has already
    been visited, or when a page cannot be fetched.
    """
    _log(' Start Scan homepage ... ')
    seen = set()
    page = indexurl
    while page and page not in seen:
        seen.add(page)
        _log(' Crawling Web page: ', host + page)
        try:
            html = _fetch_html(host + page)
        except (URLError, IOError) as err:
            _log(getattr(err, 'reason', err))
            # Retrying the same URL unconditionally would never terminate.
            break
        # Drop any unparsed tail left over from the previous document.
        indexparser.reset()
        indexparser.feed(html)
        page = indexparser.nexturl
    return indexparser.urllist


# Collects every picture URL reachable from the gallery entry links.
class Getimageurl(threading.Thread):
    """Producer thread: walks each gallery and queues its picture URLs."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Scan every gallery in ``htmldoorlist``, then queue the end marker."""
        global nexthtmlurl
        try:
            for door in htmldoorlist:
                _log(' starts getting the image address, the entry address is: ', door)
                nexthtmlurl = ''
                visited = set()
                page = door
                # A gallery is finished when there is no next link, or the
                # link leads back to a page already seen (including the entry).
                while page and page not in visited:
                    visited.add(page)
                    _log(' starts getting pictures from page%s ... ' % (host + page))
                    try:
                        html = _fetch_html(host + page)
                    except (URLError, IOError) as err:
                        _log(getattr(err, 'reason', err))
                        break
                    imageparser.reset()
                    imageparser.feed(html)
                    _log(' The next page address is: ', nexthtmlurl)
                    page = nexthtmlurl
            _log(' All picture addresses have been obtained: ', imageurllist)
        finally:
            # Always tell the downloader that no more URLs will arrive.
            imageurllist.put(_DONE)


class GetImage(threading.Thread):
    """Consumer thread: downloads every URL taken from ``imageurllist``."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Download queued pictures until the end-of-stream marker is seen."""
        global imagedownloadcount
        _log(' Start downloading pictures ... ')
        while True:
            _log(' Current number of captured images: ', imagegetcount)
            _log(' Downloaded number of images: ', imagedownloadcount)
            image = imageurllist.get()
            if image is _DONE:
                # Put the marker back so any other downloader stops as well.
                imageurllist.put(_DONE)
                break
            _log(' Download file path: ', image)
            # Check the name first: no point downloading what cannot be saved.
            match = _JPG_NAME.search(image)
            if not match:
                _log(' no match ')
                continue
            _log(' Downloading file: ', match.group())
            try:
                content = _fetch(image)
                with open(localsavepath + match.group(), 'wb') as out:
                    out.write(content)
            except (URLError, IOError) as err:
                _log(getattr(err, 'reason', err))
                continue
            imagedownloadcount = imagedownloadcount + 1
        _log(' file full download complete ... ')


def main():
    """Scan the index, then start the URL collector and the downloader."""
    global htmldoorlist
    # Get every gallery entry link from the index pages.
    entries = scan_index()
    _log(' Home scan completed, all Atlas links have been obtained: ')
    htmldoorlist = entries

    collector = Getimageurl()
    collector.start()
    _log(' Get Picture link thread start: ')

    # No head-start delay is needed: the downloader blocks on the queue and
    # stops only on the end-of-stream marker.
    downloader = GetImage()
    downloader.start()
    _log(' Download picture chain thread Start: ')


if __name__ == '__main__':
    main()

Python crawler path: a simple web image capture script, upgraded (adds multithreading support)

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.