#coding =GBK
import os
import re
import threading
from xml.dom import minidom, Node

try:
    import urllib2
except ImportError:
    # Python 3: urllib2 was merged into urllib.request; keep the old name
    # so the rest of the file does not care which interpreter runs it.
    import urllib.request as urllib2
def readsrc(src):
    '''
    Download the document at ``src`` and return its body.

    src: URL to fetch.

    Returns the response body, or None when the download fails for any
    reason (the text 'ERROR' is printed in that case).
    '''
    try:
        url = urllib2.urlopen(src)
        try:
            content = url.read()
            if not isinstance(content, str):
                # Python 3 hands back bytes, but every parser in this file
                # matches with str patterns, so decode once at the boundary.
                # NOTE(review): the GBK fallback is assumed from the file's
                # coding line -- confirm against the target site.
                charset = url.headers.get_content_charset() or 'gbk'
                content = content.decode(charset, 'replace')
        finally:
            # Always release the connection, even if read/decode fails.
            url.close()
    except Exception:
        print('ERROR')
        return None
    return content
def pictype(content):
    '''
    Extract the picture categories from the site's navigation bar.

    content: HTML source of a listing page (None is tolerated).

    Returns a list of dicts, one per category, where 'addr' is the link of
    the category and 'name' is its label. The first navigation entry is
    skipped. Returns None when the navigation list cannot be found or holds
    no category after the first entry.
    '''
    if content is None:
        return None
    # The navigation bar is the outermost plain <ul>...</ul> on the page.
    nav = re.search(r'<ul>(.*)</ul>', content, re.S)
    if nav is None:
        return None
    entry = re.compile(
        r'<li\s*.*?>\s*<a href *= *"(?P<addr>.*?)" *>(?P<name>.*?)\s*</a>\s*</li>')
    # Drop the first entry: it is not a picture category.
    categories = [m.groupdict() for m in entry.finditer(nav.group())][1:]
    return categories or None
def pageinfo(src):
    '''
    Collect the albums shown on one listing page.

    src: URL of the listing page.

    Returns a list of dicts with the keys
        name:    picture name
        cutaddr: address of the thumbnail
        picaddr: address of the album page, made absolute against the
                 directory part of ``src``
    Returns None when the page cannot be downloaded, has fewer than two
    <ul> blocks, or the second block contains no album entry.
    '''
    content = readsrc(src)
    if content is None:
        return None
    blocks = re.findall(r'<ul.*?>(.*?)</ul>', content, re.S)
    # The album list is the second <ul> on the page; bail out instead of
    # raising IndexError when the page does not have that shape.
    if len(blocks) < 2:
        return None
    # NOTE(review): the '<img alt=' part of this pattern was lost in the
    # mangled source and is reconstructed from the documented 'name' key --
    # confirm the attribute name against the real markup.
    entry = re.compile(
        r'<li><a href="(?P<picaddr>.*?)".*?><img alt="(?P<name>.*?)"'
        r' *src="(?P<cutaddr>.*?)" */></a>.*?</li>')
    base = os.path.split(src)[0]
    albums = [m.groupdict() for m in entry.finditer(blocks[1])]
    for album in albums:
        album['picaddr'] = base + '/' + album['picaddr']
    return albums or None
def nextpageaddr(src):
    '''
    Find the address of the page that follows ``src``.

    src: URL of the current listing page.

    Returns the next page's address (the link target joined onto the
    directory part of ``src``), or None on the last page or when the
    current page cannot be downloaded.
    '''
    content = readsrc(src)
    if content is None:
        return None
    # NOTE(review): class name casing ("next") reconstructed -- confirm.
    link = re.search(r'<a class="next" href="(.*?)">.*?</a>', content)
    if link is None:
        return None
    return os.path.dirname(src) + '/' + link.group(1)
def picinfoaddr(src):
    '''
    Find the "all pictures" page of an album.

    src: URL of the album page.

    Returns the address of the first link inside the album's
    <div class="picinfo"> block, joined onto the directory part of ``src``.
    Returns None when the block is missing or the page cannot be downloaded.
    '''
    content = readsrc(src)
    if content is None:
        return None
    link = re.search(
        r'<div class="picinfo">.*?<a href="(?P<addr>.*?)".*?>.*?</div>',
        content, re.S)
    if link is None:
        return None
    return os.path.dirname(src) + '/' + link.group(1)
def parseinfo(content):
    '''
    Parse the "all pictures" page of an album.

    content: HTML source of the page (None is tolerated).

    Returns a dict with the keys
        title: album title ('' when absent)
        kw:    keywords from the meta tag ('' when absent)
        type:  category names joined with ':' ('' when absent)
        pic:   list of picture addresses with the size suffix and extension
               removed; appending _220x165, _medium or _small (plus the
               extension) yields the different sizes
    Returns None when ``content`` is None or the picture list is missing.
    '''
    if content is None:
        return None
    info = {}

    # NOTE(review): the title pattern was lost in the mangled source and is
    # reconstructed as the document <title> -- confirm.
    found = re.search(r'<title>(.*?)</title>', content)
    info['title'] = found.group(1) if found else ''

    found = re.search(r'<meta name="keywords" content="(.*?)"/>', content)
    info['kw'] = found.group(1) if found else ''

    # Category trail entries; the separator after each entry may be a
    # literal '>' or the entity '&gt;' (the source is ambiguous here).
    trail = re.findall(r'<i><a.*?>(.*?)</a></i>.*?(?:&gt;|>)', content)
    info['type'] = ':'.join(trail)

    gallery = re.search(r'<ul class=".*?">(.*?)</ul>', content, re.S)
    if gallery is None:
        return None
    # NOTE(review): the '<img ... src=' part of this pattern was lost in
    # the mangled source and is reconstructed -- confirm.
    urls = re.findall(r'<a href=".*?<img.*?src="(.*?)"', gallery.group(1))
    pictures = []
    for url in urls:
        if '_' in url:
            # Cut the trailing "_<size>.<ext>" part.
            pictures.append(url.rsplit('_', 1)[0])
        else:
            # No size suffix: drop only the extension instead of blindly
            # slicing off the last character.
            pictures.append(os.path.splitext(url)[0])
    info['pic'] = pictures
    return info
class Mthread(threading.Thread):
    '''
    Worker thread that crawls one picture category.

    After the thread has run, ``picdoc`` holds an XML element describing
    every album found in the category; it stays None until ``run`` starts.
    '''

    def __init__(self, tp, addr, lock):
        '''
        tp:   category name, stored in the element's 'type' attribute
        addr: address of the category's first listing page
        lock: lock shared by all workers to serialise console output
        '''
        threading.Thread.__init__(self)
        # Private document used only as a node factory, so the worker does
        # not depend on a module-level document existing.
        self.doc = minidom.Document()
        self.tp = tp
        self.lock = lock
        self.addr = addr
        self.thread_stop = False
        self.picdoc = None

    def run(self):
        '''Walk the category page by page and collect every album.'''
        self.picdoc = self.doc.createElement('urlclass')
        self.picdoc.setAttribute('type', self.tp)
        while self.addr and not self.thread_stop:
            # pageinfo returns None for an unreadable or empty page.
            for album in pageinfo(self.addr) or []:
                picture = self._build_picture(album)
                if picture is not None:
                    self.picdoc.appendChild(picture)
            # Advance only after the current page is done, so every page
            # (including the last one) is processed exactly once.
            self.addr = nextpageaddr(self.addr)

    def _text_element(self, tag, text):
        '''Return a new <tag> element whose only child is ``text``.'''
        element = self.doc.createElement(tag)
        element.appendChild(self.doc.createTextNode(text))
        return element

    def _build_picture(self, album):
        '''Return the <picture> element for one album, or None if any of
        its pages cannot be fetched or parsed.'''
        picaddr = picinfoaddr(album['picaddr'])
        if picaddr is None:
            return None
        content = readsrc(picaddr)
        if content is None:
            return None
        info = parseinfo(content)
        if info is None:
            return None

        picture = self.doc.createElement('picture')
        picture.appendChild(self._text_element('title', info['title']))
        picture.appendChild(self._text_element('keywords', info['kw']))
        picture.appendChild(self._text_element('pictype', info['type']))
        picture.appendChild(self._text_element('piccut', album['cutaddr']))

        # Serialise progress output across the worker threads.
        self.lock.acquire()
        try:
            print('downloading %s' % info['title'])
        finally:
            self.lock.release()

        urls = self.doc.createElement('urls')
        for picurl in info['pic']:
            urls.appendChild(self._text_element('url', picurl + '.jpg'))
        picture.appendChild(urls)
        return picture

    def stop(self):
        '''Ask the worker to stop before it moves on to the next page.'''
        self.thread_stop = True
path = 'c:\\pict\\'  # download directory

# Script entry point: crawl every picture category in its own thread and
# write the collected album URLs to c:\urls.xml.
# (A leftover debugging `sys.exit(12)` that aborted the script before any
# work was done -- and raised NameError because `sys` was never imported --
# has been removed.)
if __name__ == '__main__':
    content = readsrc('http://photos.pcgames.com.cn/cate/3/1.html')
    r = pictype(content)
    lt = []
    doc = minidom.Document()
    root = doc.createElement('url_resource')
    root.setAttribute('type', 'url')
    root.setAttribute('urltype', 'image')
    root.setAttribute('imgfmt', 'jpg')
    doc.appendChild(root)
    lock = threading.RLock()
    # pictype returns None when the start page could not be parsed.
    for iaddr in r or []:
        print('Downloading type: %s' % iaddr['name'])
        th = Mthread(iaddr['name'], iaddr['addr'], lock)
        lt.append(th)
        th.start()
    for t in lt:
        t.join()
        # picdoc stays None if the worker died before creating its element.
        if t.picdoc is not None:
            root.appendChild(t.picdoc)
    print('Write')
    # `with` guarantees the file is closed even if serialisation fails.
    with open('c:\\' + 'urls' + '.xml', 'w') as f:
        f.write(doc.toprettyxml(indent=''))
    print(doc.toprettyxml())
    print('End')