Multithreaded Python crawler that batch-downloads PCGames picture URLs and saves them as XML — implementation code

Source: Internet
Author: User

Copy the code as follows:

#coding =GBK
From Xml.dom import Minidom,node
Import Urllib2,re,os
def readsrc(src):
    """Fetch the URL *src* and return the raw response body as a string.

    Returns None on any error. Errors are deliberately swallowed (the
    crawl is best-effort); callers must handle a None return.
    """
    try:
        url = urllib2.urlopen(src)
        try:
            return url.read()
        finally:
            # BUG FIX: the original never closed the handle, leaking the
            # socket for every page fetched.
            url.close()
    except Exception:
        # narrowed from a bare except so KeyboardInterrupt still works
        print('ERROR')
        return None
def pictype(content):
    """Extract the picture categories from the site's navigation bar.

    content -- HTML source of the category index page.
    Returns a list of dicts, one per category, with keys:
      'addr' -- link for the picture type
      'name' -- human-readable name of the type
    Returns None when nothing usable is found.
    """
    m = re.search(r'<ul>(.*)</ul>', content, re.S)
    if m is None:
        # BUG FIX: the original printed None and then scanned the whole
        # page anyway; without the nav <ul> there is nothing to parse.
        return None
    nav = m.group()
    item = re.compile(
        r'<li\s*.*?>\s*<a href *= *"(?P<addr>.*?)">(?P<name>.*?)\s*</a>\s*</li>')
    cats = [hit.groupdict() for hit in item.finditer(nav)]
    # first <li> is the "home" link, not a real category -- TODO confirm
    cats = cats[1:]
    return cats if cats else None
def pageinfo(src):
    """Get the thumbnail info for one category page.

    src -- URL of the category page.
    Returns a list of dicts with keys:
      'picaddr' -- absolute address of the album page
      'cutaddr' -- address of the shrunken browse (thumbnail) image
    Returns None on fetch failure or when the page has no picture list.
    """
    base = os.path.split(src)[0]
    # CONSISTENCY FIX: reuse readsrc() instead of duplicating its
    # urlopen/except logic inline.
    content = readsrc(src)
    if content is None:
        return None
    uls = re.findall(r'<ul.*?>(.*?)</ul>', content, re.S)
    # page layout: the second <ul> holds the thumbnails -- TODO confirm
    if len(uls) < 2:
        return None
    listing = uls[1]
    # NOTE(review): this pattern was garbled in the source I recovered it
    # from; reconstructed from the groupdict keys used below.
    item = re.compile(
        r'<li><a href="(?P<picaddr>.*?)".*?src="(?P<cutaddr>.*?)" */></a>.*?</li>')
    pics = [hit.groupdict() for hit in item.finditer(listing)]
    for pic in pics:
        # album links on the page are relative; anchor them to the page dir
        pic['picaddr'] = base + '/' + pic['picaddr']
    return pics if pics else None

def nextpageaddr(src):
    """Return the URL of the next category page, or None on the last page.

    src -- URL of the current page (the "next" link is relative to it).
    """
    content = readsrc(src)
    if not content:
        # BUG FIX: the original passed None straight into re.search and
        # crashed whenever the fetch failed.
        return None
    # case-insensitive: the scraped source showed class="Next" but the
    # site may use lowercase -- TODO confirm against live markup
    m = re.search(r'<a class="next" href="(.*?)">.*?</a>', content, re.I)
    if m:
        return os.path.dirname(src) + "/" + m.group(1)
    return None
def picinfoaddr(src):
    """From an album page, return the address of its "all pictures" page.

    src -- URL of the album page; the extracted link is relative to it.
    Returns None on fetch failure or when the picinfo block is absent.
    """
    content = readsrc(src)
    if not content:
        # BUG FIX: guard against a failed fetch (readsrc returns None)
        return None
    m = re.search(r'<div class="picinfo">.*?<a href="(?P<addr>.*?)".*?>.*?</div>',
                  content, re.S)
    if m:
        return os.path.dirname(src) + "/" + m.group('addr')
    return None
def parseinfo(content):
    """Parse the "all pictures" HTML of one album into its details.

    content -- HTML source of the album's picture-list page.
    Returns a dict with keys:
      'title' -- album title
      'kw'    -- keywords
      'type'  -- colon-joined type labels
      'pic'   -- list of per-picture address stems; appending
                 _220x165/_medium/_small (+ extension) selects a size
    Returns None when the picture list cannot be located.
    """
    info = {}

    # title -- NOTE(review): the original regex was lost in the scrape;
    # <title> is the plausible source. TODO confirm against live markup.
    m = re.search('<title>(.*?)</title>', content)
    info['title'] = m.group(1) if m else ''

    # keywords
    m = re.search('<meta name="keywords" content="(.*?)"/>', content)
    info['kw'] = m.group(1) if m else ''

    # type labels, joined with ':' as in the original
    labels = re.findall('<i><a.*?>(.*?)</a></i>', content)
    info['type'] = ':'.join(labels) if labels else ''

    m = re.search('<ul class=".*?">(.*?)</ul>', content, re.S)
    if not m:
        return None
    listing = m.group(1)  # restrict further parsing to the picture list

    pics = re.findall('<a href="(.*?)"', listing)
    # strip the trailing _<size> suffix so callers can pick any size
    info['pic'] = [addr[:addr.rfind('_')] for addr in pics]
    return info
Import threading
class Mthread(threading.Thread):
    """Worker thread that crawls one picture category and builds an XML
    <Urlclass> subtree (self.picdoc) describing every album found.

    The main thread grafts self.picdoc into the shared output tree only
    after join(), so no DOM node is touched by two threads at once.
    """

    def __init__(self, tp, addr, lock):
        threading.Thread.__init__(self)
        self.doc = minidom.Document()  # thread-private factory document
        self.tp = tp                   # category name (XML 'type' attr)
        self.lock = lock               # serialises console output only
        self.addr = addr               # URL of the current category page
        self.thread_stop = False
        self.picdoc = None             # result subtree, set in run()

    def run(self):
        # BUG FIX: the original built elements via the module-global `doc`
        # from several threads at once; use the thread-private document.
        doc = self.doc
        self.picdoc = doc.createElement('Urlclass')
        self.picdoc.setAttribute('type', self.tp)
        while self.addr:
            thumbs = pageinfo(self.addr)
            if not thumbs:
                break  # fetch failed or page layout unexpected
            for thumb in thumbs:
                picaddr = picinfoaddr(thumb['picaddr'])
                if not picaddr:
                    continue  # album without an "all pictures" page
                content = readsrc(picaddr)
                if not content:
                    continue
                info = parseinfo(content)
                if not info:
                    continue
                name = info['title']

                picture = doc.createElement('picture')

                title = doc.createElement('title')
                title.appendChild(doc.createTextNode(info['title']))
                picture.appendChild(title)

                keyword = doc.createElement('keywords')
                keyword.appendChild(doc.createTextNode(info['kw']))
                picture.appendChild(keyword)

                pictype_el = doc.createElement('Pictype')
                pictype_el.appendChild(doc.createTextNode(info['type']))
                picture.appendChild(pictype_el)

                cuturl = doc.createElement('piccut')
                cuturl.appendChild(doc.createTextNode(thumb['cutaddr']))
                picture.appendChild(cuturl)

                urls = doc.createElement('URLs')
                self.lock.acquire()
                try:
                    # lock held in try/finally so an I/O error on stdout
                    # cannot leave it acquired
                    print('downloading %s' % name)
                finally:
                    self.lock.release()
                for picurl in info['pic']:
                    singleurl = doc.createElement('url')
                    singleurl.appendChild(doc.createTextNode(picurl + '.jpg'))
                    urls.appendChild(singleurl)

                picture.appendChild(urls)
                self.picdoc.appendChild(picture)
            # BUG FIX: the original called pageinfo(self.addr) again for
            # the *current* page before advancing, so every page's albums
            # were processed twice; just advance to the next page.
            self.addr = nextpageaddr(self.addr)

    def stop(self):
        # cooperative stop flag; run() does not poll it -- TODO confirm
        # this was ever wired up in the original
        self.thread_stop = True


path = 'c:\\pict\\'  # download destination (kept from original; unused below)

# NOTE(review): the original had `sys.exit(12)` here (with `import sys`
# commented out), a leftover debug kill-switch that aborted the script
# before any crawling -- removed.
content = readsrc('http://photos.pcgames.com.cn/cate/3/1.html')
# guard: readsrc/pictype may return None on failure
categories = pictype(content) if content else None
lt = []
doc = minidom.Document()
root = doc.createElement('Url_resource')
root.setAttribute('type', 'URL')
root.setAttribute('urltype', 'image')
root.setAttribute('imgfmt', 'jpg')
doc.appendChild(root)
lock = threading.RLock()
# spawn one worker per picture category
for cat in (categories or []):
    print('Downloading type: %s' % cat['name'])
    th = Mthread(cat['name'], cat['addr'], lock)
    lt.append(th)
    th.start()
for t in lt:
    t.join()
    if t.picdoc is not None:  # thread may have produced nothing
        root.appendChild(t.picdoc)

print('write')
# BUG FIX: close the file even if serialisation fails
f = open('c:\\urls.xml', 'w')
try:
    f.write(doc.toprettyxml(indent='  '))
finally:
    f.close()
print(doc.toprettyxml())
print('end')

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.