Python crawler: grab all the pictures from a desktop wallpaper site

#!/usr/bin/env python
# coding=utf-8
# by Chuxing 2014/10/1
# qq:121866673
from os.path import dirname, abspath
from extract import extract, extract_all
import urllib2

# crawl targets
mainurl = 'http://desk.**.com.cn'
hosturl = 'http://desk.**.com.cn/pc/'

"""Loop over the list pages: crawl the theme-page addresses and return them as a list."""
hreflist = []
def spider1():
    for page in range(0, 236):
        # list pages follow a simple pattern: 1.html through 236.html
        i_pageurl = hosturl + str(page + 1) + '.html'
        i_urlopen = urllib2.urlopen(i_pageurl)
        i_readhtml = i_urlopen.read()
        print 'main:', i_pageurl, len(i_readhtml)
        i_htmldata = extract_all('<ul class="pic-list2 clearfix">', '</ul>', i_readhtml)
        i_htmldata = extract_all('href="', '"', str(i_htmldata))
        for d in i_htmldata:
            hreflist.append(mainurl + d)
    print 'imgpagecount:', len(hreflist)

"""Crawl the address of each picture page inside a theme and return them as a list."""
contentpage = []
def spider2():
    for cp in hreflist:
        try:
            i_urlopen = urllib2.urlopen(cp)
            i_readhtml = i_urlopen.read()
            print 'main:', cp, len(i_readhtml)
            i_htmldata = extract_all('<div class="photo-list-box">', '</ul>', i_readhtml)
            i_htmldata = extract_all('href="', '"', str(i_htmldata))
            for i in i_htmldata:
                contentpage.append(mainurl + i)
        except Exception:
            pass

"""On each picture page, grab the address of the maximum-resolution page and return them as a list."""
imgpage = []
def spider3():
    for ip in contentpage:
        try:
            i_urlopen = urllib2.urlopen(ip)
            i_readhtml = i_urlopen.read()
            # the resolution links live in the <dd id="tagfbl"> block; take the first (largest)
            i_htmldata = extract_all('<dd id="tagfbl">', '</dd>', i_readhtml)
            i_htmldata = extract_all('href="', '"', str(i_htmldata))
            i_pageurl = mainurl + i_htmldata[0]
            imgpage.append(i_pageurl)
            print 'imgpage:', i_pageurl
        except Exception:
            pass

"""Grab the picture address on each full-resolution page and return the picture URLs as a list."""
imgurl = []
def spider4():
    for img in imgpage:
        try:
            i_urlopen = urllib2.urlopen(img)
            i_readhtml = i_urlopen.read()
            # NOTE: the begin marker was swallowed by the original post's HTML filter;
            # '<img src="' is an assumed reconstruction of the lost marker
            i_htmldata = extract_all('<img src="', '"', i_readhtml)
            imgurl.append(i_htmldata[0])
            print i_htmldata[0]
        except Exception:
            pass

# folder the script resides in
PREFIX = dirname(abspath(__file__))

spider1()
spider2()
spider3()
spider4()

# generate a BAT file (requires wget); the pic\ subfolder must already exist
with open("%s\\pic\\downpic.bat" % PREFIX, "w") as down:
    for n in range(0, len(imgurl)):
        # wget's -O flag names the output file
        data = 'wget %s -O "%s\\pic\\%s.jpg"\n' % (imgurl[n], PREFIX, str(n))
        down.write(data)
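The script itself only emits the BAT file. As a sketch of an alternative (not part of the original post), the same imgurl list could be downloaded straight from Python 2 with urllib2; download_all and the pic-subfolder handling below are illustrative names, not the author's code:

import os
import urllib2

def download_all(imgurl, prefix):
    # save each image as <prefix>/pic/<n>.jpg, creating the folder if needed
    picdir = os.path.join(prefix, 'pic')
    if not os.path.isdir(picdir):
        os.makedirs(picdir)
    for n, url in enumerate(imgurl):
        try:
            data = urllib2.urlopen(url).read()
            with open(os.path.join(picdir, '%d.jpg' % n), 'wb') as f:
                f.write(data)
            print 'saved:', url
        except Exception:
            pass  # skip unreachable images, just as the spiders do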
The extract library file (extract.py, imported above):
#!/usr/bin/env python
# coding=utf-8
"""Take the string between begin and end out of html; extract_all returns every match as a list."""

def extract(begin, end, html):
    # return the first substring between begin and end, stripped
    if not html:
        return ''
    start = html.find(begin)
    if start >= 0:
        start += len(begin)
        if end is not None:
            end = html.find(end, start)
        if end is None or end >= 0:
            return html[start:end].strip()

def extract_all(begin, end, html):
    # strip every match found by _extract_all
    return map(str.strip, _extract_all(begin, end, html))

def _extract_all(begin, end, html):
    # collect every substring between begin and end, scanning left to right
    if not html:
        return []
    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start >= 0:
            start += len(begin)
            endpos = html.find(end, start)
            if endpos >= 0:
                result.append(html[start:endpos])
                from_pos = endpos + len(end)
                continue
        break
    return result
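For example, against an illustrative snippet (not a real page from the site), the two helpers behave like this in a Python 2 session:

>>> from extract import extract, extract_all
>>> html = '<ul class="pic-list2 clearfix"><li><a href="/pc/1.html">a</a></li></ul>'
>>> extract('<ul class="pic-list2 clearfix">', '</ul>', html)
'<li><a href="/pc/1.html">a</a></li>'
>>> extract_all('href="', '"', html)
['/pc/1.html']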
The wget tool is required to run the generated downpic.bat.
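Each line of the generated downpic.bat looks like this (hypothetical URL and path):

wget http://desk.**.com.cn/example/2560x1600/wallpaper.jpg -O "C:\spider\pic\0.jpg"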
Results: (screenshots of the downloaded wallpapers omitted)
Disclaimer: The code is for research purposes only and the author is not responsible for the consequences of misuse of this code.