1 #Coding:utf-82 """3 download fried egg to local, through selenium, regular expression, PHANTOMJS, BeautifulSoup realization4 """5 6 ImportRe7 ImportOS8 9 fromSeleniumImportWebdriverTen fromSelenium.webdriver.support.waitImportwebdriverwait One fromSelenium.webdriver.supportImportExpected_conditions as EC A fromSelenium.webdriver.common.byImport by - fromSelenium.common.exceptionsImporttimeoutexception - fromBs4ImportBeautifulSoup the fromUrllibImportUrlretrieve - - #solve Google Browser is under the control of automated testing software - #options = Webdriver. Chromeoptions () + #options.add_argument (' Disable-infobars ') - +URL ='Http://jandan.net/ooxx' A #Driver = Webdriver. Chrome (chrome_options=options) atDriver =Webdriver. PHANTOMJS () -wait = webdriverwait (driver, 30) - - #Download the folder of fried egg and sister saved -Img_save_file ='Images' - in #gets the total number of pages. Open Fried egg Net-sister figure the default page can get to the total number of pages - defget_default_page_num (): to Try: + driver.get (URL) -Page_element = Wait.until (ec.presence_of_element_located (By.css_selector,'. Current-comment-page'))) the returnPage_element.text * excepttimeoutexception: $ Get_default_page_num ()Panax Notoginseng - #get the URL of a picture the defGet_img_url (page_number): +Img_url_list = [] AURL = r'http://jandan.net/ooxx/page-'+ STR (page_number) + R'#comments' the PrintURL + #url = ' http://www.baidu.com ' -HTML =driver.get (URL) $ Try: $ driver.get (URL) -Wait.until (ec.presence_of_element_located (By.css_selector,'#comments > Ol img'))) - excepttimeoutexception: the Print "failed to open the page, reload the page" - Get_img_url (Page_number)Wuyi the #get page HTML element -HTML =Driver.page_source Wu #through BeautifulSoup parsing -Soup = BeautifulSoup (HTML,'Html.parser') About #Find all the tags for img $IMGs = Soup.find_all ('img') - #gif images need to get the Ora_src property, which is the full GIF image. 
Has_attr determine if there is a property, Attrs can get the property value - forImginchIMGs: - ifImg.has_attr ('org_src'): AImg_url = img.attrs['org_src'] + Else: theImg_url = img.attrs['src'] - img_url_list.append (Img_url) $ returnimg_url_list the the #download images via Urllib's Urlretrieve implementation the defdownload_img (img_url): theImg_name = Img_url.split ('/') [-1] -Img_save_path = Img_save_file +'/'+Img_name inUrlretrieve (Img_url, Img_save_file +'/'+img_name) the the #Create the folder where the picture is stored About defAdd_img_save_file (img_save_file): the ifos.path.exists (img_save_file): the Pass the Else: + os.makedirs (img_save_file) - the defMain ():Bayi add_img_save_file (img_save_file) the #extracting the current number of pages with regular expressions thePartner = Re.compile (r'(\d+)') -Content =Get_default_page_num () -Total_pages =partner.search (content). Group () the the forIinchRange (1, int (total_pages) + 1): the Print "You are downloading section"+ STR (i) +'the picture, URL is:', theImg_url_list =Get_img_url (str (i)) - forImg_urlinchimg_url_list: the download_img (Img_url) the the if __name__=='__main__':94Main ()
# Python crawler: download the pictures from the jandan.net ("fried egg") picture board to a local folder