1. Crawling approach: Python + Selenium
2. Workflow
Selenium types the search keyword automatically, crawls the result pages, creates one folder per movie, and stores each movie's download (magnet) link in a text file.
3. The code
#!/usr/bin/python
#-*-coding:utf-8-*-
import io
import os
import random
import re
import time
import urllib2

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
# Start a Chrome session; every helper below drives this module-level `browser`.
browser = webdriver.Chrome()
# Alternative to maximizing: browser.set_window_position(...) /
# browser.set_window_size(...) -- the original arguments are not recoverable.
browser.maximize_window()  # maximize the window
# Implicit wait: let element lookups poll instead of failing immediately.
# NOTE(review): the original timeout value was lost and the method was never
# actually called; 10 seconds is a placeholder -- tune as needed.
browser.implicitly_wait(10)
browser.get('http://www.dytt8.net/')
# Locate the search box once, empty it and type the search keyword.
# NOTE(review): u'terror' is the keyword as it appears in this copy of the
# script -- confirm it is the term the site is expected to be searched for.
_search_box = browser.find_element_by_xpath(
    '//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input')
_search_box.clear()
_search_box.send_keys(u'terror')
def close(browser):
    """Close every window except the focused one, then re-focus that window.

    Args:
        browser: a WebDriver-like object exposing ``current_window_handle``,
            ``window_handles``, ``switch_to_window()`` and ``close()``.

    When no other window exists nothing is closed, so the session's only
    window is never shut down by accident.
    """
    # Handle of the window we want to keep (window A).
    current = browser.current_window_handle
    # Iterate over a snapshot: closing windows changes the live handle list.
    for handle in list(browser.window_handles):
        if handle != current:
            # A window must be focused before it can be closed (window B).
            browser.switch_to_window(handle)
            browser.close()
    # Focus is undefined after closing a window; return to the kept one.
    browser.switch_to_window(current)
def change(browser):
    """Move focus to the most recently listed window that is not focused.

    Args:
        browser: a WebDriver-like object exposing ``current_window_handle``,
            ``window_handles`` and ``switch_to_window()``.

    If the focused window is the only one, focus is left unchanged.
    """
    # Handle of the window currently in focus (window A).
    current = browser.current_window_handle
    # Every other open window (window B, ...), in the driver's order.
    others = [h for h in browser.window_handles if h != current]
    if others:
        # Switch once, straight to the last candidate, rather than hopping
        # through every intermediate window.
        browser.switch_to_window(others[-1])
def back(browser):
    """Close the focused window and move focus to the first remaining one.

    Args:
        browser: a WebDriver-like object exposing ``current_window_handle``,
            ``window_handles``, ``switch_to_window()`` and ``close()``.

    If the focused window is the only window, nothing happens: closing it
    would leave no window to return to.
    """
    # Handle of the window to be closed (window B).
    current = browser.current_window_handle
    # Windows that will still exist afterwards (window A, ...).
    remaining = [h for h in browser.window_handles if h != current]
    if not remaining:
        return
    browser.close()
    # Focus is undefined after close(); go back to the first surviving window.
    browser.switch_to_window(remaining[0])
def backn(browser):
    """Close all windows other than the focused one and stay on that window.

    Used after focus has moved to a newly opened window, to discard the
    window(s) left behind.

    Args:
        browser: a WebDriver-like object exposing ``current_window_handle``,
            ``window_handles``, ``switch_to_window()`` and ``close()``.

    With a single open window nothing is closed.
    """
    # Handle of the window to keep (the one currently in focus).
    keep = browser.current_window_handle
    # Iterate over a snapshot: closing windows changes the live handle list.
    for handle in list(browser.window_handles):
        if handle != keep:
            # A window must be focused before it can be closed.
            browser.switch_to_window(handle)
            browser.close()
    # Return to the kept window by handle instead of by list position, which
    # is only valid when exactly two windows were open.
    browser.switch_to_window(keep)
# Get rid of any extra window that opened, and return to the main window.
close(browser)
# Re-enter the search keyword in the (re-focused) main window.
# NOTE(review): u'terror' is the keyword as it appears in this copy of the
# script -- confirm it is the term the site is expected to be searched for.
_search_box = browser.find_element_by_xpath(
    '//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input')
_search_box.clear()
_search_box.send_keys(u'terror')
ele = browser.find_element_by_xpath(
    '//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[2]/input')
# A direct click on the submit button does not work here, so the click is
# simulated by sending the Enter key to it.
ele.send_keys(Keys.ENTER)
# The browser is now on the search-result page; parse it.
obj = BeautifulSoup(browser.page_source, 'html.parser')
def _save_download_link(link_xpath, folder):
    """Right-click the link at *link_xpath* and write its href to ``bt.txt``.

    Operates on the module-level ``browser``.  Any failure (element not
    found, missing href, I/O error) propagates to the caller.
    """
    link = browser.find_element_by_xpath(link_xpath)
    ActionChains(browser).context_click(link).perform()
    hrefs = link.get_attribute('href')
    print(hrefs)
    # Explicit UTF-8 so a link containing non-ASCII characters cannot fail
    # on the platform's default encoding; `with` closes the file on error.
    with io.open(os.path.join(folder, 'bt.txt'), 'w', encoding='utf-8') as bt_file:
        bt_file.write(hrefs)


def fun(obj, num):
    """Crawl result pages, saving each movie's download link into its folder.

    For each of the first ten ``<table>`` entries of the listing, a folder
    named after the entry title is created under ``e:/test/dytt/bt/``, the
    detail page is opened in a new window, and the download link's ``href``
    is written to ``bt.txt`` in that folder.  The pager link is then clicked
    and the next page is processed the same way.

    Args:
        obj: parsed result page (``BeautifulSoup``) to start from.
        num: ``0`` for the first result page (pager cell ``td[9]`` is
            clicked), anything else for later pages (``td[10]``).
            NOTE(review): presumably later pages have one extra pager
            cell before the "next" link -- verify against the live markup.

    Returns:
        None, when a page has no listing container.  Otherwise the loop only
        ends when the pager link cannot be found, and the WebDriver
        exception raised by that lookup propagates to the caller.

    Relies on the module-level ``browser`` and on ``change``/``back``/``backn``.
    """
    base_dir = 'e:/test/dytt/bt/'
    # Characters that Windows forbids in file names.
    # NOTE(review): the original pattern was garbled; this is a reconstruction.
    forbidden = re.compile(r'[\\/:*?"<>|]')
    pager_xpath = ('//*[@id="header"]/div/div[3]/div[3]/div[2]/div[2]/div[2]'
                   '/ul/table[11]/tbody/tr/td[%d]/a')
    # The download link sits in one of two places depending on page layout.
    primary_xpath = '//*[@id="Zoom"]/span/table/tbody/tr/td/a'
    fallback_xpath = '//*[@id="Zoom"]/span/div[5]/table/tbody/tr/td/a'

    # Iterate page by page; a loop avoids hitting the recursion limit on
    # long result sets.
    while True:
        # NOTE(review): class name assumed to be lowercase 'co_content8';
        # the source copy shows it capitalised -- confirm against the site.
        container = obj.find('div', {'class': 'co_content8'})
        if container is None:
            return

        # Only the first ten entries of each page are processed.
        for table in container.find_all('table')[:10]:
            anchor = table.find('a')
            if anchor is None:
                continue
            href = anchor.get('href')
            # Replace special characters so the title is a valid folder name.
            title = forbidden.sub('-', anchor.text)
            folder = os.path.join(base_dir, title)

            # Create the per-movie folder.
            if os.path.exists(folder):
                print('This folder already exists!')
            else:
                os.makedirs(folder)
                print(title)

            # Open the detail page in a new window and switch to it.
            browser.execute_script(
                "window.open('http://www.ygdy8.com" + href + "')")
            change(browser)

            # Right-click the download link and store its address.
            try:
                _save_download_link(primary_xpath, folder)
            except Exception:
                print('WE can try another way!')
                try:
                    _save_download_link(fallback_xpath, folder)
                except Exception:
                    print('This is a game!')

            # Close the detail window and return to the listing.
            back(browser)

        # All entries handled: follow the pager to the next page.
        pager_cell = 9 if num == 0 else 10
        browser.find_element_by_xpath(pager_xpath % pager_cell).click()
        change(browser)
        backn(browser)
        obj = BeautifulSoup(browser.page_source, 'html.parser')
        num = 1
def get_html(url):
    """Download *url* and return the raw response body.

    The request carries a desktop-Chrome ``User-Agent`` header.

    Args:
        url: address to fetch.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        request fails with ``urllib2.URLError`` (the error is printed).
    """
    from contextlib import closing

    # Define the request headers.
    user_agent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/51.0.2704.103 Safari/537.36")
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        # closing() releases the connection even if read() raises.
        with closing(urllib2.urlopen(request)) as response:
            html = response.read()
    except urllib2.URLError as e:
        print('%s Download error: %s' % (url, e.reason))
        html = None
    return html
# Start crawling from the first result page (num == 0 marks the first page).
fun(obj, 0)
Crawling horror movies (and games) from Movie Paradise (dytt8.net) with Python