Scraping comics from 哦漫畫 (omanhua.com) with Python


import os

import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver


# Parse a single comic page (rendered in the browser) and download its image
def manhua(url):
    browser.get(url)

    # Grab the page source as rendered by the simulated browser visit
    html = browser.page_source
    html = etree.HTML(html)
    img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
    alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]
    title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]
    print(img_url, alt, title)

    # Build the save directory: ./漫畫/<comic>/<chapter>/
    path = './漫畫/' + alt + '/' + title + '/'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = img_url.split('/')[-1]
    print(os.path.join(path, fname))

    # Request the image itself
    response = requests.get(img_url)
    # response.content is the raw binary payload
    data = response.content
    # Write the image to disk
    with open(path + fname, 'wb') as f:
        f.write(data)


# Parse a chapter page and walk through its paginated comic pages
def manhua_url(url):
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    # i is the page count, shown on the page as e.g. "(12)"
    i = html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1]
    i = int(i)
    # The pagination follows a fixed pattern, so build each page link with format()
    url = url + '/index.html?p={}'
    for n in range(1, i + 1):
        fullurl = url.format(n)
        print(fullurl)
        # fullurl is one paginated comic-page link
        manhua(fullurl)


# Parse a comic's chapter-list page
# (renamed from `list` so it no longer shadows the built-in)
def chapter_list(lb_url):
    response = requests.get(lb_url)
    response.encoding = response.apparent_encoding
    html = BeautifulSoup(response.text, 'lxml')
    # Match every chapter link
    url_list = html.select('div.subBookList ul li')
    for li in url_list:
        chapter = li.select('a')[0].get('href').split('/')[-2]
        # Join with '/' explicitly; os.path.join would insert '\' on Windows
        fullurl = lb_url.rstrip('/') + '/' + chapter
        print(fullurl)
        # Follow the chapter link
        manhua_url(fullurl)


# Parse the home page
def shouye():
    # Home page URL
    base_url = 'http://www.omanhua.com/'
    # Fetch and decode the page
    response = requests.get(base_url)
    response.encoding = response.apparent_encoding
    # Parse the returned HTML
    html = BeautifulSoup(response.text, 'lxml')
    # Match the "hottest comics" links
    url_list = html.select('ul#cartoon_image_show1 li')
    for li in url_list:
        href = li.select('a')[0].get('href')[1:]
        # Build the full comic URL
        fullurl = base_url + href
        print(fullurl)
        chapter_list(fullurl)


if __name__ == '__main__':
    # Use the selenium automation module to simulate a real browser (Chrome) visit:
    # the images are loaded with JavaScript, so a plain request never sees
    # the image URL. The path below points at the ChromeDriver binary.
    browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
    shouye()
    browser.quit()
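A side note on the driver setup: in Selenium 4 the executable_path argument was removed, and from 4.6 on the bundled Selenium Manager locates a matching ChromeDriver by itself. Below is a minimal sketch of the equivalent setup, assuming Selenium 4.6+ and a recent Chrome (the --headless=new flag needs Chrome 109 or later; older builds take plain --headless):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
# Run Chrome without opening a window; convenient for long scraping runs.
# Assumes Chrome 109+; on older builds use '--headless' instead.
options.add_argument('--headless=new')
# No executable_path needed: Selenium Manager (bundled since Selenium 4.6)
# downloads and caches a matching ChromeDriver automatically.
browser = webdriver.Chrome(options=options)
browser.get('http://www.omanhua.com/')
print(browser.title)
browser.quit()

One more caveat: some image hosts reject requests that lack a Referer header. If the saved files come back empty or as error pages, passing headers={'Referer': url} to requests.get() in manhua() is a common workaround, though whether this particular site checks the header is an assumption.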

I only started teaching myself web scraping recently, so the code is probably a bit convoluted. I hope we can all learn and improve together.
