標籤:com 類比 FN webdriver 驅動 一起學 join div row
# -*- coding: utf-8 -*-
"""Crawler for omanhua.com.

Walks the home page -> per-comic chapter lists -> per-chapter pages, and
downloads every comic image to ./漫畫/<comic>/<chapter>/.

Selenium is required because the image element (#mangaFile) is filled in
by JavaScript, so a plain requests.get() never sees the image URL.
"""
import os

import requests
from lxml import etree
from bs4 import BeautifulSoup
from selenium import webdriver


def manhua(url):
    """Render one comic page with the Selenium browser and save its image.

    Reads the module-level ``browser`` created in the __main__ block.
    """
    browser.get(url)
    # Use the browser-rendered source: the <img id="mangaFile"> src is
    # injected by JS and is absent from the raw HTML.
    html = etree.HTML(browser.page_source)

    img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
    alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]    # comic title
    title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]    # chapter title
    print(img_url, alt, title)

    # ./漫畫/<comic>/<chapter>/ — exist_ok avoids the check-then-create race.
    path = os.path.join('./漫畫', alt, title)
    os.makedirs(path, exist_ok=True)

    fname = img_url.split('/')[-1]
    fpath = os.path.join(path, fname)
    print(fpath)

    # Fetch the raw image bytes and write them out in binary mode.
    response = requests.get(img_url)
    with open(fpath, 'wb') as f:
        f.write(response.content)


def manhua_url(url):
    """Find how many pages a chapter has, then crawl each page."""
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)

    # The second span reads like "(12)"; strip the parentheses for the count.
    page_count = int(html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1])

    # Pages follow a fixed pattern: <chapter>/index.html?p=<n>
    page_tpl = url + '/index.html?p={}'
    for n in range(1, page_count + 1):
        fullurl = page_tpl.format(n)
        print(fullurl)
        manhua(fullurl)


def chapter_list(lb_url):
    """Parse a comic's chapter-list page and crawl every chapter.

    (Renamed from ``list`` to stop shadowing the builtin.)
    """
    response = requests.get(lb_url)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')

    for item in soup.select('div.subBookList ul li'):
        # href looks like "/comicX/chapterY/" — keep the chapter segment.
        chapter = item.select('a')[0].get('href').split('/')[-2]
        # Plain string joining, NOT os.path.join: these are URLs, and
        # os.path.join would insert backslashes on Windows.
        fullurl = lb_url.rstrip('/') + '/' + chapter
        print(fullurl)
        manhua_url(fullurl)


def shouye():
    """Parse the home page and crawl every comic in the 'hottest' list."""
    base_url = 'http://www.omanhua.com/'
    response = requests.get(base_url)
    # Decode with the encoding requests sniffed from the body.
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')

    for item in soup.select('ul#cartoon_image_show1 li'):
        # href starts with "/": drop it so concatenation keeps one slash.
        comic = item.select('a')[0].get('href')[1:]
        fullurl = base_url + comic
        print(fullurl)
        chapter_list(fullurl)


if __name__ == '__main__':
    # Chrome via Selenium; the path below is the local chromedriver binary.
    browser = webdriver.Chrome(
        executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
    try:
        shouye()
    finally:
        # Always release the browser process, even if the crawl fails.
        browser.quit()
剛開始自學爬蟲不久,代碼可能寫的有點繁瑣,希望和大家一起學習學習進步
python爬取哦漫畫