Python crawler path-simple Web Capture upgrade (add multithreading support)

Last Update:2014-11-24 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more ＞

Reprinted from the author's own blog: http://www.mylonly.com/archives/1418.html

#!/usr/bin/env python
# coding: utf-8
#########################################################################
# File Name:main.py
# author:mylonly
# Mail: [email protected]
# Created time:wed June 08:22:12 PM CST
#########################################################################

import re
import threading
import time

try:  # Python 3 module names, aliased to the Python 2 names the script uses
    import html.parser as HTMLParser
    import queue as Queue
    import urllib.request as urllib2
except ImportError:  # Python 2, which the original listing targeted
    import HTMLParser
    import Queue
    import urllib2

# Entry link of every photo album found on the index pages.
html_door_list = []
# Links of HTML pages that contain pictures (declared by the original script
# but never used).
html_url_list = []
# Queue of picture URLs: filled by GetImageUrl, drained by GetImage.
image_url_list = Queue.Queue(0)
# Number of picture URLs captured so far.
image_get_count = 0
# Number of pictures downloaded so far.
image_download_count = 0
# "Next page" link of the album page parsed most recently.
next_html_url = ''
# Local directory the pictures are written to; it must already exist.
local_save_path = '/data/1920x1080/'

# To download another resolution, change REPLACE_STR. The site offers
# 1920X1200, 1980X1920, 1680X1050, 1600X900, 1440X900, 1366X768, 1280X1024,
# 1024x768 and 1280x800.
REPLACE_STR = '1920x1080'
# Resolution segment found in the picture URLs embedded in the album pages;
# it is replaced by REPLACE_STR before the URL is queued.
REPLACED_STR = '960x600'

HOST = 'http://www.poco.cn'
INDEX_URL = '/vision.htx&index_type=hot&gid=-1#list'

# Marker queued by GetImageUrl after the last URL so GetImage knows when to
# stop instead of guessing from Queue.empty().
_END_OF_QUEUE = None

# File name of a picture: the trailing digits plus ".jpg".
_IMAGE_NAME = re.compile(r'[0-9]*\.jpg')


def _attr_is(attr, name, value):
    """Return True if the (name, value) pair *attr* matches *name*/*value*.

    HTMLParser already lower-cases attribute names.  The value is compared
    case-insensitively as well.
    NOTE(review): the listing this was restored from had its letter case
    mangled, so the exact case of the site's id/class values is unknown --
    confirm against the live markup.
    """
    return attr[0] == name and (attr[1] or '').lower() == value


def _fetch(url):
    """Return the raw body of *url*, closing the connection in every case."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _fetch_html(url):
    """Return the body of *url* as text that HTMLParser.feed() accepts."""
    body = _fetch(url)
    if bytes is not str and isinstance(body, bytes):
        # Python 3 returns bytes.  Only ASCII tags, ids and URLs are
        # inspected, so undecodable bytes are replaced rather than fatal.
        body = body.decode('utf-8', 'replace')
    return body


class ImageHtmlParser(HTMLParser.HTMLParser):
    """Album-page parser.

    Queues the URL of the page's big picture on ``image_url_list`` (with
    REPLACED_STR swapped for REPLACE_STR) and records the "next page" link in
    ``self.next_url`` and in the module-level ``next_html_url``.
    """

    def __init__(self):
        # Explicit base call: Python 2's HTMLParser is an old-style class,
        # so super() is not available there.
        HTMLParser.HTMLParser.__init__(self)
        self.next_url = ''

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; *attrs* is a list of (name, value) pairs."""
        global image_get_count, next_html_url
        if tag == 'img' and len(attrs) > 2:
            # The picture is the <img id="bigimg" ...>; its URL is taken
            # from the second attribute, as in the original script.
            if _attr_is(attrs[0], 'id', 'bigimg') and attrs[1][1]:
                url = attrs[1][1].replace(REPLACED_STR, REPLACE_STR)
                image_url_list.put(url)
                image_get_count += 1
                print(url)
        elif tag == 'a' and len(attrs) == 4:
            if (_attr_is(attrs[0], 'id', 'pagenext')
                    and _attr_is(attrs[1], 'class', 'next')):
                self.next_url = next_html_url = attrs[2][1] or ''


class IndexHtmlParser(HTMLParser.HTMLParser):
    """Index-page parser.

    Collects album entry links in ``url_list`` and the link to the next index
    page in ``next_url``.  An album link is an ``<a>`` carrying the value
    ``pic`` that follows an ``<li>`` carrying ``photo-list-padding``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.url_list = []
        # Position in tag_list/class_list: 0 = waiting for the <li>,
        # 1 = waiting for the <a> inside it.
        self.index = 0
        self.next_url = ''
        self.tag_list = ['li', 'a']
        self.class_list = ['photo-list-padding', 'pic']

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; *attrs* is a list of (name, value) pairs."""
        if tag == self.tag_list[self.index]:
            wanted = self.class_list[self.index]
            if not any((value or '').lower() == wanted for _, value in attrs):
                return
            if self.index == 0:
                # First level found: now look for the link inside it.
                self.index = 1
                return
            # Second level found: the link is the second attribute's value.
            self.index = 0
            if len(attrs) > 1 and attrs[1][1]:
                print(attrs[1][1])
                self.url_list.append(attrs[1][1])
        elif tag == 'a':
            if len(attrs) > 1 and any(
                    _attr_is(attr, 'id', 'pagenext') for attr in attrs):
                self.next_url = attrs[1][1] or ''
                print('Nexturl: %s' % self.next_url)


# Parser for the index pages.
index_parser = IndexHtmlParser()
# Parser for the album pages.
image_parser = ImageHtmlParser()


def scan_index(parser, start_url=INDEX_URL):
    """Walk the index pages from *start_url* and return the album links.

    Scanning stops when a page links to itself, has no next link, or cannot
    be fetched.  The original retried a failing page immediately and forever.
    """
    index_url = start_url
    while index_url:
        page = HOST + index_url
        print('Crawling Web page: %s' % page)
        try:
            parser.feed(_fetch_html(page))
        except urllib2.URLError as err:
            print(err.reason)
            break
        if index_url == parser.next_url:
            break
        index_url = parser.next_url
    return parser.url_list


class GetImageUrl(threading.Thread):
    """Producer thread: visits every album and queues its picture URLs."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Walk each album in ``html_door_list`` page by page."""
        try:
            for door in html_door_list:
                print('starts getting the image address, '
                      'the entry address is: %s' % door)
                visited = set()
                page = door
                # Stop on a missing next link or on a page already seen
                # (albums link back to their first page); either case made
                # the original loop forever.
                while page and page not in visited:
                    visited.add(page)
                    print('starts getting pictures from page %s ...'
                          % (HOST + page))
                    # Cleared so a page without a next link is detectable.
                    image_parser.next_url = ''
                    try:
                        image_parser.feed(_fetch_html(HOST + page))
                    except urllib2.URLError as err:
                        print(err.reason)
                        break
                    print('The next page address is: %s'
                          % image_parser.next_url)
                    page = image_parser.next_url
        finally:
            # Always tell the downloader that no more URLs will arrive.
            image_url_list.put(_END_OF_QUEUE)
        print('All picture addresses have been obtained: %d' % image_get_count)


class GetImage(threading.Thread):
    """Consumer thread: downloads every queued picture into local_save_path."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Download queued URLs until the end-of-queue marker arrives."""
        global image_download_count
        print('Start downloading pictures ...')
        while True:
            print('Current number of captured images: %d' % image_get_count)
            print('Downloaded number of images: %d' % image_download_count)
            image = image_url_list.get()
            if image is _END_OF_QUEUE:
                break
            print('Download file path: %s' % image)
            match = _IMAGE_NAME.search(image)
            if not match:
                # No file name can be derived, so skip the download.
                print('no match')
                continue
            try:
                content = _fetch(image)
            except urllib2.URLError as err:
                print(err.reason)
                continue
            print('Downloading file: %s' % match.group())
            try:
                with open(local_save_path + match.group(), 'wb') as target:
                    target.write(content)
            except (IOError, OSError) as err:
                # One unwritable file must not end the whole download run.
                print(err)
                continue
            image_download_count += 1
        print('file full download complete ...')


def main():
    """Scan the index, then collect and download pictures on two threads."""
    print('Start Scan homepage ...')
    # Slice assignment keeps the module-level list object the thread reads.
    html_door_list[:] = scan_index(index_parser)
    print('Home scan completed, all Atlas links have been obtained:')

    collector = GetImageUrl()
    collector.start()
    print('Get Picture link thread start:')

    # No head-start sleep is needed: the downloader blocks on the queue
    # until a URL or the end-of-queue marker arrives.
    downloader = GetImage()
    downloader.start()
    print('Download picture chain thread Start:')


if __name__ == '__main__':
    main()

Python crawler path-simple Web Capture upgrade (increase multithreading support)

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Python crawler path-simple Web Capture upgrade (add multithreading support)

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Python crawler path-simple Web Capture upgrade (add multithreading support)

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support