Python web crawler: downloading One Piece (Pirate King) pictures

I just wanted to crawl pictures of my favorite series, One Piece (the Pirate King).
You first need to create an imgcache directory on the D: drive, since the script saves every image to d:/imgcache/.
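A minimal way to create that directory up front (the script below also contains an mkdir helper that is never called; the path here simply mirrors the one hard-coded in DownImg):

    import os

    cache_dir = r"d:/imgcache"  # same path hard-coded in DownImg below
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)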
The full script (Python 2; it relies on BeautifulSoup from bs4 and the third-party threadpool module):

    # -*- coding: utf-8 -*-
    import urllib
    import urllib2
    import json
    from bs4 import BeautifulSoup
    import threadpool
    import thread

    class HtmlPaser:
        def __init__(self):
            # POST data interface
            self.url = 'http://1.hzfans.sinaapp.com/process.php'

        def Post(self, postdata):
            # headers = {
            #     'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
            # }
            # data = urllib.urlencode(postdata)
            # req = urllib2.Request(self.url, data, headers)
            # resp = urllib2.urlopen(req, None, 20)
            # html = resp.read()
            # return html
            data = urllib.urlencode(postdata)
            req = urllib2.Request(self.url, data)
            html = urllib2.urlopen(req).read()
            print html

        # Fetch HTML content
        def GetHtml(self, url):
            headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            req = urllib2.Request(url, None, headers)
            resp = urllib2.urlopen(req, None, 5)
            html = resp.read()
            # return html.decode('utf8')
            return html

        def GetHtml2(self, url):
            page = urllib.urlopen(url)
            html = page.read()
            page.close()
            return html

        def GetHtml3(self, url):
            req_header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Accept-Encoding': 'gzip',
                'Connection': 'close',
                # Note: if the page still cannot be fetched, set the target site's host here
                'Referer': None
            }
            req_timeout = 5
            req = urllib2.Request(url, None, req_header)
            resp = urllib2.urlopen(req, None, req_timeout)
            html = resp.read()
            return html

        # Extract the article links from a list page
        def GetList(self, html):
            soup = BeautifulSoup(''.join(html))
            baseitem = soup.find('ul', {'class': 'list'})
            slist = baseitem.select('li a')
            return slist

        # Download one image into d:/imgcache/
        def DownImg(self, imgurl):
            path = r"d:/imgcache/" + self.gGetFileName(imgurl)
            data = urllib.urlretrieve(imgurl, path)
            return data

        # Take the file name from the last segment of the URL
        def gGetFileName(self, url):
            if url is None:
                return None
            if url == "":
                return ""
            arr = url.split("/")
            return arr[len(arr) - 1]

        def mkdir(self, path):
            import os
            path = path.strip()
            path = path.rstrip("\\")
            # Check whether the path exists: True if it does, False otherwise
            isexists = os.path.exists(path)
            if not isexists:
                # Create the folder since it does not exist yet
                os.makedirs(path)
                return True
            else:
                # The folder already exists, so do not create it again
                return False

        # Returns two values: title and content
        def ParseContent(self, html):
            soup = BeautifulSoup(''.join(html))
            baseitem = soup.find('div', {'class': 'showbox'})
            title = soup.find('div', {'class': 'msg'}).find('div', {'class': 'm_left'}).get_text()
            imglist = baseitem.find_all('img')
            for img in imglist:
                imgurl = img.get('src')
                self.DownImg(imgurl)
            content = baseitem.get_text().encode('utf8')
            # Cut off everything from the "hot recommendation" block onwards
            position = content.find('热点推荐')
            return title, content[0:position]

        def ParseItem(self, item):
            url = item.get('href')
            if url is None:
                return
            # print url + '\n'
            html = self.GetHtml2(url)
            title, content = self.ParseContent(html)
            # print title + '\n'
            return title


    def print_result(request, result):
        print request.requestID, ':', result


    obj = HtmlPaser()
    pool = threadpool.ThreadPool(10)
    for i in range(1, 40):
        url = "http://op.52pk.com/shtml/op_wz/list_2594_%d.shtml" % (i)
        html = obj.GetHtml2(url)
        items = obj.GetList(html)
        print 'add job %d\r' % (i)
        requests = threadpool.makeRequests(obj.ParseItem, items, print_result)
        [pool.putRequest(req) for req in requests]
    pool.wait()
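For reference, the same fetch-plus-thread-pool pattern could look roughly like the sketch below on Python 3. This is an assumption-laden port, not the original code: it swaps urllib2 for urllib.request and the threadpool module for concurrent.futures, and the per-article image downloading is elided.

    # Rough Python 3 sketch of the list-page crawl with a thread pool (assumes bs4 is installed)
    from concurrent.futures import ThreadPoolExecutor
    from urllib.request import Request, urlopen
    from bs4 import BeautifulSoup

    HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def get_html(url):
        req = Request(url, headers=HEADERS)
        return urlopen(req, timeout=5).read()

    def parse_item(item):
        url = item.get('href')
        if url:
            html = get_html(url)
            # ... parse the page and download images, as ParseContent does above ...
            return url

    with ThreadPoolExecutor(max_workers=10) as pool:
        for i in range(1, 40):
            html = get_html("http://op.52pk.com/shtml/op_wz/list_2594_%d.shtml" % i)
            soup = BeautifulSoup(html, 'html.parser')
            baseitem = soup.find('ul', {'class': 'list'})
            if baseitem is None:
                continue
            for result in pool.map(parse_item, baseitem.select('li a')):
                print(result)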