# coding=utf-8
__author__ = 'Litao'
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re, time, random
import selenium.common.exceptions
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from pymongo import MongoClient

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36"
)  # override the User-Agent in the PhantomJS request headers
service_args = ['--load-images=false', '--disk-cache=true']  # run the browser without images and with the disk cache enabled
browser = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
browser.set_window_size(1920, 1080)  # use a realistic window size so elements can be located and clicked
wait = WebDriverWait(browser, 10)  # maximum wait time for explicit waits
browser.get(url="https://www.taobao.com")


def search(retry_times):
    """Search Taobao for 'Food', parse the first result page, and return the total-page text."""
    # wait = WebDriverWait(browser, 10)  # set the maximum wait time
    # browser.get(url="https://www.taobao.com")
    try:
        input_content = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
        search_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".btn-search")))
        input_content.send_keys("Food")
        search_button.click()
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".total")))
        print("ok")
        print("1")
        parse_page(1)
        return total.text
    except selenium.common.exceptions.TimeoutException as e:
        print(e)
        if retry_times > 0:
            retry_times -= 1
            return search(retry_times)  # retry the crawl after a timeout
        return None


def next_page(page_number, retry_times):
    """Jump to the given result page via the page-number input box, then parse it."""
    try:
        input_content = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        search_button = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
        input_content.clear()
        input_content.send_keys(page_number)
        search_button.click()
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        print(str(page_number))
        parse_page(page_number)
    except selenium.common.exceptions.TimeoutException as e:
        print(e)
        if retry_times > 0:
            retry_times -= 1
            return next_page(page_number, retry_times)  # retry the crawl after a timeout


def parse_page(page_number):
    """Parse the currently loaded result page with BeautifulSoup and save every item to MongoDB."""
    if page_number == 1:
        list2 = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#J_itemlistPersonality > div:nth-child(1) > div:nth-child(1)")))
        ActionChains(browser).move_to_element(list2).perform()
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_itemlistCont")))
    html = browser.page_source
    # print(html + "\n" + "\n" + "\n" + "\n")
    html = html.replace("item J_MouserOnverReq", "item_J_MouserOnverReq")  # join the two class names so BeautifulSoup can match them as one
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find_all('div', attrs={"class": "item_J_MouserOnverReq"})
    print(len(content))
    for item in content:
        result = {
            "image": item.find('img').get('data-src'),
            "price": item.find(class_="price").text.strip(),
            "deal": item.find(class_="deal-cnt").text.strip()[:-3],  # drop the trailing unit text after the deal count
            "title": re.sub(r'\s', '', item.find(class_="title").text.strip()),
            "shop": re.sub(r'\s', '', item.find(class_="shop").text.strip()),
            "location": re.sub(r'\s', '', item.find(class_="location").text.strip())
        }
        print(result)
        save_to_mongodb(result)
    print("**********************************************************************************************")


def save_to_mongodb(product):
    """Insert one product record into the local MongoDB instance."""
    client = MongoClient('127.0.0.1', 27017)
    db = client.taobao
    db["Taobao_meishi"].insert_one(product)


def main():
    try:
        result = search(2)
        if result:
            count_page = int(re.search(r'.*?(\d+).*', result).group(1))  # pull the page count out of the total text
            for i in range(2, count_page + 1):
                time.sleep(random.randint(1, 3))  # pause randomly between pages
                print("-----", i)
                next_page(i, 2)
    except Exception as e:
        print("An error occurred while the program was running; the details are:" + '\n', e)
    finally:
        browser.close()  # the try/except/finally structure guarantees the browser is closed before the program exits, whatever caused the failure


if __name__ == "__main__":
    main()

A classic practice example combining Selenium, PhantomJS, and BeautifulSoup.
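
To check what the crawler actually stored, the records can be read back with pymongo. The following is a minimal sketch, assuming MongoDB is running on 127.0.0.1:27017 and the script above has already written documents into the taobao database's Taobao_meishi collection; the field names match the result dictionary built in parse_page().

from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)       # same connection settings as save_to_mongodb()
collection = client.taobao["Taobao_meishi"]    # database and collection written by the crawler

# print a few stored products to verify the crawl worked
for product in collection.find().limit(5):
    print(product.get("title"), product.get("price"), product.get("location"))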