# coding: utf-8
"""Forum image crawler.

Walks the thread-list pages of one forum section, opens every thread linked
from them and saves each image referenced in a thread into the current
working directory as ``<thread title><image index>.jpg``.

The code runs on Python 2 (``urllib2``) and Python 3 (``urllib.request``).
"""
import re
import threading
from contextlib import closing

try:  # Python 2
    import urllib2
except ImportError:  # Python 3 exposes the same Request/urlopen API here
    import urllib.request as urllib2

# Site root that the relative links found in the pages are appended to.
BASE_URL = "http://www.xxx.com/"
# Thread-list URL of the crawled forum section; the page number is appended.
FORUM_URL = BASE_URL + "forumdisplay.php?fid=19&page="
# Browser-like header sent with every page request.
# NOTE(review): letter case and the closing parenthesis were restored from a
# corrupted copy of this script; confirm the exact value if it matters.
HEADERS = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"}

# NOTE(review): only the tail of this pattern (`" alt=".*.jpg"/>`) survived in
# the corrupted source. The `<img src="(...)` prefix with its capture group is
# a reconstruction; confirm it against the real thread markup.
IMG_LINK_RE = re.compile(r'<img src="(.*)" alt=".*.jpg"/>')
# Group 1 is the relative thread link, group 2 the thread title.
# NOTE(review): the letter case of "3d" was not reliable in the corrupted
# source, so both cases are accepted.
ARTICLE_LINK_RE = re.compile(
    r'<a href="(viewthread\.php\?tid=.*3[dD].*)">(.*)</a>')

start = 1    # first page number to request
end = 100    # number of consecutive pages requested, counted from `start`


def _to_text(value):
    """Return *value* as text, decoding byte strings as UTF-8."""
    if isinstance(value, bytes):
        return value.decode("utf-8", "replace")
    return value


def _safe_name(name):
    """Replace path separators so a page-supplied title stays one file name."""
    return re.sub(r"[\\/]", "_", name)


def _fetch(url):
    """Return the raw body of *url*, requested with the browser-like headers."""
    request = urllib2.Request(url, headers=HEADERS)
    with closing(urllib2.urlopen(request)) as response:
        return response.read()


def loadimg(addr, x, y, artname):
    """Download one image and save it as ``<artname><y>.jpg``.

    addr    -- URL of the image.
    x       -- index of the thread on its list page (unused here; kept so
               existing callers keep working).
    y       -- index of the image inside the thread; part of the file name.
    artname -- thread title (text or UTF-8 bytes); file name prefix.

    The file is written to the current working directory.
    """
    with closing(urllib2.urlopen(addr)) as response:
        data = response.read()
    # The title comes from a web page, so it must not be able to point
    # outside the working directory.
    filename = _safe_name(_to_text(artname)) + str(y) + ".jpg"
    with open(filename, "wb") as out:
        out.write(data)


def getimglink(html, x, artname, base_url=BASE_URL):
    """Find the image links in one thread page and download them.

    html     -- thread page content (text or UTF-8 bytes).
    x        -- index of the thread on its list page (only printed).
    artname  -- thread title, used by loadimg() as the file name prefix.
    base_url -- prefix prepended to every link found in the page.

    Each image is downloaded by loadimg() in its own thread. All threads of
    the page are joined before returning, so the files exist on return and
    the number of live threads is bounded by the images of a single thread
    page.
    """
    workers = []
    for y, lin in enumerate(IMG_LINK_RE.findall(_to_text(html))):
        imgaddr = base_url + lin
        print("loadimg:" + str(x) + " " + imgaddr + "\n")
        # Pass the callable and its arguments separately: writing
        # target=loadimg(...) would run the download right here and hand
        # Thread a None target.
        worker = threading.Thread(target=loadimg,
                                  args=(imgaddr, x, y, artname))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()


def getarticlelink(html, page, base_url=BASE_URL):
    """Find the thread links in one list page and crawl every thread.

    html     -- list page content (text or UTF-8 bytes).
    page     -- index of the list page (unused here; kept so existing
                callers keep working).
    base_url -- prefix prepended to every link found in the page.
    """
    for x, lin in enumerate(ARTICLE_LINK_RE.findall(_to_text(html)), 1):
        thread_html = _fetch(base_url + lin[0])
        getimglink(thread_html, x, lin[1], base_url)


def main():
    """Crawl `end` consecutive list pages, beginning with page `start`."""
    for page in range(end):
        html = _fetch(FORUM_URL + str(page + start))
        print("Start")
        getarticlelink(html, page)


if __name__ == "__main__":
    main()
A simple Python web crawler that grabs images from a forum.