Multithreaded crawler that batch-downloads PCgames picture URLs and saves them as XML: implementation code

Multithreaded crawler batch download Pcgame picture URL Save as XML implementation code _python

Last Update:2017-01-18 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

The code is as follows:

#coding =GBK
import os
import re
import threading
import urllib2
from xml.dom import Node, minidom
def readsrc(src):
    """Fetch the page at *src* and return its raw, undecoded body.

    Args:
        src: URL passed to ``urllib2.urlopen``.

    Returns:
        The response body as read from the connection, or ``None`` (after
        printing ``ERROR``) when the page cannot be fetched.
    """
    try:
        url = urllib2.urlopen(src)
        try:
            # The body is deliberately left undecoded; callers regex-match it
            # as-is.
            return url.read()
        finally:
            # Release the connection even when read() fails.
            url.close()
    except Exception:
        # Best-effort crawler: report the failure and let the caller deal
        # with None.  ``Exception`` (not a bare except) keeps Ctrl-C working.
        print('ERROR')
        return None
def pictype(content):
    """Extract the picture categories from the site's navigation bar.

    Args:
        content: HTML source of a page containing the navigation ``<ul>``.

    Returns:
        A list of dicts, one per category, where ``addr`` is the category
        link and ``name`` is the category name.  The first navigation entry
        is skipped.  Returns ``None`` when *content* is empty, when no
        ``<ul>`` block is found, or when no category remains.
    """
    if not content:
        return None

    # Greedy match: spans from the first <ul> to the last </ul>.
    nav = re.search(r'<ul>(.*)</ul>', content, re.S)
    if not nav:
        # The original printed None here and carried on; the documented
        # contract is to return None on error.
        return None

    item_re = re.compile(
        r'<li\s*.*?>\s*<a href *= *"(?P<addr>.*?)">(?P<name>.*?)\s*</a>\s*</li>'
    )
    entries = [m.groupdict() for m in item_re.finditer(nav.group())]
    # Drop the first navigation entry, as the original code does.
    entries = entries[1:]
    return entries or None
def pageinfo(src):
    """Collect the picture entries listed on one category page.

    Args:
        src: URL of the category page.

    Returns:
        A list of dicts with the keys
        ``name`` (picture name), ``cutaddr`` (thumbnail address) and
        ``picaddr`` (address of the picture's own page, prefixed with the
        directory part of *src*).  Returns ``None`` when the page cannot be
        fetched, has fewer than two ``<ul>`` blocks, or lists no pictures.
    """
    base = os.path.split(src)[0]

    # readsrc() prints 'ERROR' and returns None on failure, which is exactly
    # what the inline fetch here used to do.
    content = readsrc(src)
    if content is None:
        return None

    # The pictures live in the second <ul> of the page.
    lists = re.findall(r'<ul.*?>(.*?)</ul>', content, re.S)
    if len(lists) < 2:
        # Guard against IndexError on pages with an unexpected layout.
        return None

    # NOTE(review): the middle of this pattern (the <img ... alt="..."> part
    # carrying the ``name`` group) was lost in the published listing and has
    # been reconstructed from the docstring -- confirm against the live markup.
    item_re = re.compile(
        r'<li><a href="(?P<picaddr>.*?)".*?><img.*?alt="(?P<name>.*?)"'
        r' *src="(?P<cutaddr>.*?)" */></a>.*?</li>'
    )
    pictures = [m.groupdict() for m in item_re.finditer(lists[1])]
    for pic in pictures:
        # Hrefs are relative to the directory of the page they appear on.
        pic['picaddr'] = base + '/' + pic['picaddr']
    return pictures or None

def nextpageaddr(src):
    """Return the address of the page that follows *src*.

    Args:
        src: URL of the current page.

    Returns:
        The directory of *src* joined with the href of the page's "next"
        link, or ``None`` when the page cannot be fetched or has no such
        link (i.e. it is the last page).
    """
    content = readsrc(src)
    if content is None:
        # Fetch failed; treat it like "no next page" instead of crashing
        # inside the regex search.
        return None

    # NOTE(review): the published listing shows class="Next", but its
    # capitalisation is unreliable, so the match is case-insensitive.
    found = re.search(r'<a class="next" href="(.*?)">.*?</a>', content, re.I)
    if not found:
        return None
    return os.path.dirname(src) + "/" + found.group(1)
def picinfoaddr(src):
    """Return the address linked from the ``picinfo`` block of an album page.

    Args:
        src: URL of the album page.

    Returns:
        The directory of *src* joined with the href of the first link inside
        ``<div class="picinfo">``, or ``None`` when the page cannot be
        fetched or contains no such block.
    """
    content = readsrc(src)
    if content is None:
        # Fetch failed; avoid passing None to the regex engine.
        return None

    found = re.search(
        r'<div class="picinfo">.*?<a href="(?P<addr>.*?)".*?>.*?</div>',
        content,
        re.S,
    )
    if not found:
        return None
    return os.path.dirname(src) + "/" + found.group('addr')
def parseinfo(content):
    """Extract the details of one album from its "all pictures" page.

    Args:
        content: HTML source of the page.

    Returns:
        A dict with the keys
        ``title`` (page title, ``''`` if absent),
        ``kw`` (keywords meta content, ``''`` if absent),
        ``type`` (category names joined with ``:``, ``''`` if absent) and
        ``pic`` (list of picture addresses with the trailing ``_<size>``
        suffix removed; appending ``_220x165``, ``_medium`` or ``_small``
        yields the differently sized variants).
        Returns ``None`` when *content* is empty or has no ``<ul class=...>``
        picture list.
    """
    if not content:
        return None

    info = {}

    # title
    # NOTE(review): this pattern was lost in the published listing and is
    # reconstructed from the key name -- confirm.
    found = re.search(r'<title>(.*?)</title>', content)
    info['title'] = found.group(1) if found else ''

    # keywords (tolerates both `"/>` and `" />` endings)
    found = re.search(r'<meta name="keywords" content="(.*?)"\s*/?>', content)
    info['kw'] = found.group(1) if found else ''

    # type
    info['type'] = ':'.join(re.findall(r'<i><a.*?>(.*?)</a></i>.*?>', content))

    # Narrow the search to the picture list before collecting addresses.
    found = re.search(r'<ul class=".*?">(.*?)</ul>', content, re.S)
    if not found:
        return None
    listing = found.group(1)

    # NOTE(review): everything after `<a href=".*?` was lost in the published
    # listing; capturing the <img> src is inferred from the docstring.
    sources = re.findall(r'<a href=".*?<img.*?src="(.*?)"', listing)

    # Cut the size suffix ("_small.jpg", "_medium.jpg", ...).  Addresses
    # without an underscore are kept whole instead of losing a character.
    info['pic'] = [s[:s.rfind('_')] if '_' in s else s for s in sources]
    return info
class Mthread(threading.Thread):
    """Worker thread that crawls one picture category into an XML element.

    After the thread has finished, ``picdoc`` holds an element with one
    ``picture`` child per album found while following the category's pages.

    NOTE(review): element names keep the spelling shown in the published
    listing ('Urlclass', 'Pictype', 'URLs'); its capitalisation is
    unreliable -- confirm before depending on the output schema.
    """

    def __init__(self, tp, addr, lock):
        """Create a worker.

        Args:
            tp: category name, stored as the ``type`` attribute of the result.
            addr: URL of the first page of the category.
            lock: lock shared by all workers to serialise console output.
        """
        threading.Thread.__init__(self)
        self.doc = minidom.Document()  # private factory for this thread's nodes
        self.tp = tp
        self.lock = lock
        self.addr = addr
        self.thread_stop = False
        self.picdoc = None  # filled in by run()

    def run(self):
        """Walk the category page by page and collect every album."""
        self.picdoc = self.doc.createElement('Urlclass')
        self.picdoc.setAttribute('type', self.tp)

        while self.addr and not self.thread_stop:
            # pageinfo() returns None for an empty or unreadable page.
            for item in pageinfo(self.addr) or []:
                if self.thread_stop:
                    break
                picture = self._build_picture(item)
                if picture is not None:
                    self.picdoc.appendChild(picture)
            # Advance only after the current page is done.  The original
            # re-fetched the current page before advancing, which processed
            # page 1 twice and never reached the last page.
            self.addr = nextpageaddr(self.addr)

    def _text_child(self, parent, tag, text):
        """Append ``<tag>text</tag>`` to *parent*."""
        node = self.doc.createElement(tag)
        node.appendChild(self.doc.createTextNode(text))
        parent.appendChild(node)

    def _build_picture(self, item):
        """Build the ``picture`` element for one pageinfo() entry.

        Returns None when the album page cannot be located or parsed.
        """
        picaddr = picinfoaddr(item['picaddr'])
        if picaddr is None:
            return None
        html = readsrc(picaddr)
        if html is None:
            return None
        info = parseinfo(html)
        if info is None:
            return None

        # Nodes come from self.doc rather than the module-level ``doc`` so
        # the thread does not depend on a global shared with other workers.
        picture = self.doc.createElement('picture')
        self._text_child(picture, 'title', info['title'])
        self._text_child(picture, 'keywords', info['kw'])
        self._text_child(picture, 'Pictype', info['type'])
        self._text_child(picture, 'piccut', item['cutaddr'])

        # The lock only keeps progress lines from different threads apart.
        with self.lock:
            print('downloading %s' % info['title'])

        urls = self.doc.createElement('URLs')
        for picurl in info['pic']:
            self._text_child(urls, 'url', picurl + '.jpg')
        picture.appendChild(urls)
        return picture

    def stop(self):
        """Ask run() to finish after the album it is currently handling."""
        self.thread_stop = True

# Driver: start one Mthread per picture category found on the entry page,
# wait for all of them, and write the merged result to c:\URLs.xml.
path = 'c:\\pict\\'  # download path

# A debugging early-exit (`import sys` / `sys.exit(12)`) is intentionally left
# disabled here; in the published listing only its import was commented out.

content = readsrc('http://photos.pcgames.com.cn/cate/3/1.html')
# Both readsrc() and pictype() signal failure with None.
r = pictype(content) if content else None

lt = []
doc = minidom.Document()
root = doc.createElement('Url_resource')
root.setAttribute('type', 'URL')
root.setAttribute('urltype', 'image')
root.setAttribute('imgfmt', 'jpg')
doc.appendChild(root)

lock = threading.RLock()
for iaddr in r or []:
    print('Downloading type: %s' % iaddr['name'])
    th = Mthread(iaddr['name'], iaddr['addr'], lock)
    lt.append(th)
    th.start()

for t in lt:
    t.join()
    # picdoc is only set once run() has started; skip workers without one.
    if t.picdoc is not None:
        root.appendChild(t.picdoc)

print('Write')
# `with` closes the file even if serialisation or the write fails.
with open('c:\\' + 'URLs' + '.xml', 'w') as f:
    f.write(doc.toprettyxml(indent=''))
print(doc.toprettyxml())
print('End')

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Multithreaded crawler batch download Pcgame picture URL Save as XML implementation code _python

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Multithreaded crawler batch download Pcgame picture URL Save as XML implementation code _python

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support