Description
1. A crawler for a comic-book site. So far it only fetches a single listing page. It is already two o'clock in the morning and I am hungry and sleepy, so I will sleep first and write up a summary tomorrow!
2.
import urllib.request
import re
import os

# Landing page that lists the books to crawl.
LIST_URL = "http://www.yaoqmh.net/shaonvmanhua/list_4_1.html"
# Browser-like User-Agent so the site does not reject the request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                  'Gecko/20100101 Firefox/23.0',
}


def _fetch_html(url):
    """Download *url* sending HEADERS and return the body decoded as UTF-8."""
    req = urllib.request.Request(url=url, headers=HEADERS)
    # BUG FIX: the original built the Request but passed the bare URL to
    # urlopen(), so the User-Agent header was never actually sent.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")


def _list_books():
    """Scrape LIST_URL and return parallel lists (book_ids, book_names)."""
    html = _fetch_html(LIST_URL)
    # Keep only the central content column ("mainleft" .. "mainright");
    # drop the sidebar and header markup so the regexes only see book links.
    html = html[html.find("mainleft"):html.find("mainright")]
    book_ids = re.findall(r'href="/shaonvmanhua/(\d+)\.html"', html)
    book_names = re.findall(r'title="(.+?)"', html)
    return book_ids, book_names


def _download_book(book_id, book_name):
    """Download every page image of one book into a folder named *book_name*.

    Leaves the working directory where it found it, even on error.
    """
    html = _fetch_html("http://www.yaoqmh.net/shaonvmanhua/" + book_id + ".html")
    # NOTE(review): the page-count regex was garbled by machine translation in
    # the source; the site is Chinese, so it most likely read "共N页:"
    # ("N pages in total:") — TODO confirm against the live page markup.
    page_count = int(re.findall(r"共(\d+)页:", html)[0])
    # The first image URL fixes both the directory number on pic.taov5.com
    # and the numeric file name the sequence starts from.
    img_dir = re.findall(r"http://pic\.taov5\.com/1/(\d+)/\d+\.jpg", html)[0]
    first_img = int(re.findall(r"http://pic\.taov5\.com/1/\d+/(\d+)\.jpg", html)[0])

    # BUG FIX: os.mkdir raised FileExistsError on every re-run.
    os.makedirs(book_name, exist_ok=True)
    os.chdir(book_name)
    try:
        print("currently downloading:" + book_name)
        for j in range(page_count):
            img_url = ("http://pic.taov5.com/1/" + img_dir + "/"
                       + str(first_img + j) + ".jpg")
            req = urllib.request.Request(url=img_url, headers=HEADERS)
            # 'with' closes both the HTTP response and the output file
            # (the original leaked the response object every iteration).
            with urllib.request.urlopen(req) as resp, \
                    open(str(j) + ".jpg", "wb") as out:
                out.write(resp.read())
            print("downloaded %d pages, total %d pages" % (j + 1, page_count))
    finally:
        # Always climb back to the parent directory — otherwise an exception
        # mid-book would leave us nesting new folders inside this one.
        os.chdir(os.path.dirname(os.getcwd()))


def main():
    """Crawl the list page and download every book found on it."""
    book_ids, book_names = _list_books()
    for book_id, book_name in zip(book_ids, book_names):
        _download_book(book_id, book_name)


if __name__ == "__main__":
    main()
Python Learning Notes (11) — A Crawler to Download Comic Images