import os
import requests
from lxml import etree
from bs4 import BeautifulSoup
from selenium import webdriver


# parse a single comic page and download the image on it
def manhua(url):
    browser.get(url)
    # get the page source of the simulated browser visit
    html = browser.page_source
    html = etree.HTML(html)
    img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
    alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]
    title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]
    print(img_url, alt, title)
    # print(html)

    # build the save path: ./Comics/<comic name>/<chapter title>/
    path = './Comics/' + alt + '/' + title + '/'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = img_url.split('/')[-1]
    # print(fname)
    print(os.path.join(path, fname))
    # request.urlretrieve(img_url, os.path.join(path, fname))

    # request the picture address
    response = requests.get(img_url)
    # binary content of the image
    data = response.content
    # save the file
    with open(path + fname, 'wb') as f:
        f.write(data)


# parse a chapter and build the link of every comic page in it
def manhua_url(url):
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = etree.HTML(html)
    # print(html)
    # i is the number of comic pages; [1:-1] strips the surrounding characters
    i = html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1]
    i = int(i)
    # print(i)
    # find the pagination rule and splice the page links with str.format
    url = url + '/index.html?p={}'
    # print(url)
    for n in range(1, i + 1):
        fullurl = url.format(n)
        print(fullurl)
        # time.sleep(2)
        # fullurl is the link of one paginated comic page
        manhua(fullurl)


# parse the chapter list page (note: this name shadows the built-in list)
def list(lb_url):
    response = requests.get(lb_url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = BeautifulSoup(html, 'lxml')
    # match all chapter links
    url_list = html.select('div.subBookList ul li')
    for url in url_list:
        url = url.select('a')[0].get('href').split('/')[-2]
        # print(url)
        fullurl = os.path.join(lb_url, url)
        print(fullurl)
        # chapter link
        manhua_url(fullurl)
    # print(url_list)
    # print(html)


# parse the home page
def shouye():
    # home link
    base_url = 'http://www.omanhua.com/'
    # initiate a request
    response = requests.get(base_url)
    # decode
    response.encoding = response.apparent_encoding
    # get the returned page
    html = response.text
    # print(html)
    # parse it
    html = BeautifulSoup(html, 'lxml')
    # match the links of the hottest comics
    url_list = html.select('ul#cartoon_image_show1 li')
    for url in url_list:
        # print(url)
        url = url.select('a')[0].get('href')[1:]
        # alt = url.select('a')
        # print(alt)
        # splice the links
        fullurl = os.path.join(base_url, url)
        print(fullurl)
        list(fullurl)


if __name__ == '__main__':
    # use the automated-testing module selenium to simulate browser access;
    # the image link is loaded dynamically, so plain requests cannot get it,
    # hence Chrome is driven here. The path below is the chromedriver path.
    browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
    shouye()
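The `executable_path` argument used above is the Selenium 3 style. If you run Selenium 4 or later, that argument has been removed and the driver path is passed through a `Service` object instead. A minimal sketch, assuming the same chromedriver location as in the listing:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the chromedriver path in a Service object
service = Service(r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
browser = webdriver.Chrome(service=service)

Apart from how the driver is constructed, the rest of the script works the same way: `browser.get(url)` and `browser.page_source` are unchanged.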
This is my first self-taught crawler, so the code may be a bit clumsy. I hope to keep learning and improving together with you.
Python crawler for Oh Manhua (omanhua.com) comics