Selenium, PhantomJS & BeautifulSoup Practice: Classic Examples

Source: Internet
Author: User
Tags: mongoclient, dcap

# coding = Utf-8
__autor__ = ' Litao '

import random
import re
import time

import selenium.common.exceptions
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Module-level browser setup shared by every function below.
#
# Start from Selenium's stock PhantomJS capabilities and override the
# User-Agent request header sent by the headless browser.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36"
)
# Do not load images, and enable the disk cache. The flags must not carry
# surrounding whitespace or PhantomJS will not recognise them.
service_args = ['--load-images=false', '--disk-cache=true']
brower = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
# Use a fixed, large window so elements can be located when click
# operations are performed.
brower.set_window_size(1920, 1080)
# Shared explicit wait; 10 is the maximum wait time passed to WebDriverWait.
wait = WebDriverWait(brower, 10)
brower.get(url="https://www.taobao.com")


def search(retry_times):
    """Submit the search keyword on the page currently loaded in ``brower``.

    Types the keyword into the ``#q`` input, clicks ``.btn-search``, waits
    for the ``.total`` element, and hands the first result page to
    ``pase_page``.

    Args:
        retry_times: How many more attempts are allowed after a
            ``TimeoutException``.

    Returns:
        The text of the ``.total`` element, or ``None`` when every attempt
        timed out.
    """
    try:
        input_content = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
        search_botton = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".btn-search")))
        # Clear first: on a retry the box may still hold the keyword typed by
        # the previous attempt, and send_keys would append to it.
        input_content.clear()
        input_content.send_keys("Food")
        search_botton.click()
        totle = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".total")))
        print("0k")
        print("1")
        pase_page(1)
        return totle.text
    except selenium.common.exceptions.TimeoutException as e:
        print(e)
        if retry_times > 0:
            retry_times -= 1
            # A wait timed out: try the whole search again.
            return search(retry_times)
        return None


def next_page(page_number, retry_times):
    """Jump to result page ``page_number`` through the pager form and parse it.

    Fills the pager's page-number input, clicks its submit button, waits
    until the active pager item shows ``page_number``, then calls
    ``pase_page``.

    Args:
        page_number: The result page to navigate to.
        retry_times: How many more attempts are allowed after a
            ``TimeoutException``.

    Returns:
        None.
    """
    try:
        # presence_of_element_located takes ONE locator tuple (by, value).
        input_content = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        search_botton = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
        input_content.clear()
        input_content.send_keys(str(page_number))
        search_botton.click()
        # The page switch is confirmed once the highlighted pager item shows
        # the requested number.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(page_number)))
        print(str(page_number))
        pase_page(page_number)
    except selenium.common.exceptions.TimeoutException as e:
        print(e)
        if retry_times > 0:
            retry_times -= 1
            # A wait timed out: try this page again.
            return next_page(page_number, retry_times)


def pase_page(page_number):
    """Parse the result page currently shown in ``brower`` and store each item.

    Every product card found in the page source is turned into a dict,
    printed, and passed to ``save_to_mongodb``.

    Args:
        page_number: Number of the page being parsed; page 1 gets an extra
            wait-and-hover step before the HTML is read.

    Returns:
        None.
    """
    if page_number == 1:
        # NOTE(review): the extent of this ``if`` body was reconstructed from
        # a copy of the script whose indentation was lost - confirm.
        list2 = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#J_itemlistPersonality > div:nth-child(1) > div:nth-child(1)")))
        ActionChains(brower).move_to_element(list2).perform()
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_itemlistCont")))
    html = brower.page_source
    # Collapse the two-word class value into a single token so that the
    # find_all() below can select the product cards by one class name.
    # NOTE(review): class-name casing was reconstructed - verify against the
    # live page markup.
    html = html.replace("item J_MouserOnverReq", "item_J_MouserOnverReq")
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find_all('div', attrs={"class": "item_J_MouserOnverReq"})
    print(len(content))
    for item in content:
        result = {
            "image": item.find('img').get('data-src'),
            "price": item.find(class_="price").text.strip(),
            # Drop the last three characters of the deal-count text.
            "deal": item.find(class_="deal-cnt").text.strip()[:-3],
            # Remove every whitespace character from the text fields below.
            "title": re.sub(r'\s', '', item.find(class_="title").text.strip()),
            "shop": re.sub(r'\s', '', item.find(class_="shop").text.strip()),
            "location": re.sub(r'\s', '', item.find(class_="location").text.strip()),
        }
        print(result)
        save_to_mongodb(result)
    print("**********************************************************************************************")


def save_to_mongodb(product):
    """Insert one product document into the ``Taobao_meishi`` collection.

    Connects to MongoDB at 127.0.0.1:27017 and writes into the ``taobao``
    database.

    Args:
        product: The document (a dict) to insert.

    Returns:
        None.
    """
    # The context manager closes the client after the insert; without it
    # every call left an open client behind.
    with MongoClient('127.0.0.1', 27017) as client:
        db = client.taobao
        # insert_one() replaces Collection.insert(), which PyMongo removed.
        db["Taobao_meishi"].insert_one(product)


def main():
    """Run the crawl: search, read the page count, then visit pages 2..N.

    Any exception raised along the way is printed rather than propagated,
    and the browser is always shut down before the function returns.
    """
    try:
        result = search(2)
        if result:
            # The first run of digits in the ".total" text is used as the
            # number of result pages.
            match = re.search(r'(\d+)', result)
            if match is None:
                raise ValueError("no page count found in %r" % (result,))
            count_page = int(match.group(1))
            for i in range(2, count_page + 1):
                # Pause 1-3 seconds (chosen at random) between page requests.
                time.sleep(random.randint(1, 3))
                print("-----", i)
                next_page(i, 2)
    except Exception as e:
        print("An error occurred during program run, the specific error is as follows:" + '\n', e)
    finally:
        # Whatever caused the failure, the browser is shut down before the
        # program exits. quit() ends the whole driver session, whereas
        # close() only closes the current window.
        brower.quit()


# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

Selenium, PhantomJS & BeautifulSoup Practice: Classic Examples

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please email us, and we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.