Python網頁靜態爬蟲_

Python網頁靜態爬蟲__Python

最後更新：2018-07-30 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

本文基於慕課網的視頻教程，抓取百度百科中1000條詞條頁面資訊。

編程環境：Python3.5

抓取網頁資訊包括以下幾個部分：url管理器，下載器，解析器，輸出器：

（1）讀取要爬取的網頁URL，可命名為root_url

（2）解析root_url網頁中的內容，並將其中包含的其他url存進url管理器

（3）輸出HTML檔案，其中包含url、title、summary等資訊

下面將配合代碼的形式，講解如何爬取網頁資訊。

主函數：

# -*- coding:utf-8 -*-
"""Crawler entry point: wires the URL manager, downloader, parser and outputer together."""
import url_manager     # URL manager
import html_download   # page downloader
import html_parser     # page parser
import html_outputer   # HTML report writer


class SpiderMain(object):
    """Drive the crawl loop: fetch a URL, parse it, queue its links, collect its data."""

    def __init__(self):
        # One instance of each collaborator, reused for the whole crawl.
        self.urls = url_manager.UrlManage()
        self.downloader = html_download.Downloader()
        self.parser = html_parser.Parser()
        self.outputer = html_outputer.Outputer()

    def crawl(self, root_url, max_count=1000):
        """Crawl pages starting at ``root_url`` and write the collected data out.

        Args:
            root_url: URL of the first page to fetch.
            max_count: Stop after this many pages were processed successfully
                (defaults to 1000). Pages that fail are not counted.

        The outputer is always asked to write its report, even when the crawl
        is interrupted (for example with Ctrl-C).
        """
        count = 1
        self.urls.add_new_url(root_url)  # seed the manager with the root URL
        try:
            # The manager normally keeps refilling: every parsed page adds the
            # links it contains.
            while self.urls.has_new_url():
                try:
                    new_url = self.urls.get_new_url()
                    print('crawl %d : %s' % (count, new_url))
                    html_content = self.downloader.download(new_url)
                    # Links found on the page plus the page's own data.
                    urls, data = self.parser.parse(new_url, html_content)
                    self.urls.add_new_urls(urls)      # queue links for later iterations
                    self.outputer.collect_data(data)  # keep data for the final report
                    if count >= max_count:
                        break
                    count += 1
                except Exception as err:
                    # A single bad page (dead link, unexpected markup, ...) must
                    # not stop the crawl. Only Exception is caught so that
                    # KeyboardInterrupt / SystemExit still terminate the loop.
                    print("crawl failed", repr(err))
        finally:
            # Write whatever was collected to the HTML file.
            self.outputer.output()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"  # root URL
    obj_spider = SpiderMain()
    obj_spider.crawl(root_url)

url管理器：

class UrlManage(object):
    """Keep track of URLs that still have to be crawled and URLs already handed out."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already returned by get_new_url()

    def add_new_url(self, url):
        """Queue a single URL unless it is None, already queued, or already handed out."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        """Return True when at least one URL is waiting to be crawled."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove one waiting URL, record it as handed out, and return it.

        Raises:
            KeyError: if no URL is waiting (``set.pop`` on an empty set).
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, urls):
        """Queue every URL from ``urls``.

        ``urls`` may be None (ignored) or any iterable, including generators;
        an empty iterable is a no-op.
        """
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

下載器：

from urllib import request


class Downloader():
    """Fetch the raw content of a web page."""

    def download(self, url):
        """Open ``url`` and return the body of the response.

        Returns:
            The bytes read from the response, or None when ``url`` is None or
            the response code is not 200.
        """
        if url is None:
            return None
        # The context manager closes the connection even if read() raises.
        with request.urlopen(url) as response:
            if response.getcode() != 200:  # anything but 200 counts as a failed fetch
                return None
            return response.read()  # the whole page (HTML) as returned by the server

解析器：

from bs4 import BeautifulSoup  # BeautifulSoup4 parses the downloaded markup
import re
from urllib import parse


class Parser():
    """Extract outgoing entry links and the title/summary from a downloaded page."""

    def get_urls(self, page_url, soup):
        """Return the set of absolute URLs of all ``/view/<digits>.htm`` links in ``soup``."""
        # Matching hrefs look like /view/123.htm
        links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
        # hrefs are relative, so join each one with the page URL to complete it.
        return {parse.urljoin(page_url, link['href']) for link in links}

    def get_data(self, page_url, soup):
        """Return a dict with the page's ``url``, ``title`` and ``summary``.

        Returns None when the title or summary node is missing from the page,
        instead of failing with AttributeError on a None lookup result.
        """
        title_box = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        title = title_box.find("h1") if title_box is not None else None
        summary = soup.find('div', class_="lemma-summary")
        if title is None or summary is None:
            return None
        data = {}
        data['url'] = page_url
        data['title'] = title.get_text()
        data['summary'] = summary.get_text()
        return data

    def parse(self, page_url, html_cont):
        """Parse ``html_cont`` and return ``(urls, data)``.

        Returns None (not a tuple) when ``page_url`` or ``html_cont`` is None.
        ``data`` is None when the page has no title or summary node; the links
        are still returned in that case.
        """
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf8')
        urls = self.get_urls(page_url, soup)  # links found on the page
        data = self.get_data(page_url, soup)  # url/title/summary of the page, or None
        return urls, data

輸出器：

# -*- coding:utf-8 -*-
import html
import string


class Outputer():
    """Collect crawled page data and write it to ``output.html`` as a table."""

    def __init__(self):
        self.datas = []  # one dict per page, in collection order

    def collect_data(self, data):
        """Remember ``data`` for the final report; None is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output(self):
        """Write all collected data to ``output.html`` in the current directory.

        Each collected dict must provide the keys ``url``, ``title`` and
        ``summary``. Values come from scraped pages, so they are HTML-escaped
        before being written into the table.
        """
        # The context manager closes the file even if a write fails.
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html>')
            # The <head> with the charset declaration keeps Chinese text from
            # being displayed garbled.
            fout.write("<head>")
            fout.write('<meta charset="utf-8">')
            fout.write("</head>")
            fout.write('<body>')
            fout.write('<table>')
            for data in self.datas:
                fout.write('<tr>')  # one row per page
                for key in ('url', 'title', 'summary'):
                    # one cell per field
                    fout.write('<td>%s</td>' % html.escape(str(data[key])))
                fout.write('</tr>')
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')

對網頁解析器BeautifulSoup的補充說明，舉例如下：

import re
from bs4 import BeautifulSoup

# Sample document used by every lookup below (a single line, no line breaks).
html_doc = (
    "<html><head><title>The Dormouse's story</title></head>"
    '<body><p class="title">'
    "<b>The Dormouse's story</b></p>"
    '<p class="story">Once upon a time there were three little sisters; and their names were'
    '<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,'
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and'
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;'
    'and they lived at the bottom of a well.</p>'
    '<p class="story">...</p>'
)

# Arguments: the HTML document string, the HTML parser to use, and the
# encoding of the document.
# NOTE(review): html_doc is already a str, so from_encoding presumably has no
# effect here - confirm against the Beautiful Soup documentation.
soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf8')


def _show_link(tag):
    """Print the tag name, href and text of an anchor node."""
    print(tag.name, tag['href'], tag.get_text())


# Searching nodes: find_all(name, attrs, string)

# Every node whose tag is "a".
print('擷取所有帶a的連結')
links = soup.find_all('a')
for link in links:
    _show_link(link)

# "a" nodes whose link has the form '/view/123.htm' would be found with:
#     soup.find_all('a', href='/view/123.htm')
print('擷取帶tillie的連結')
link1 = soup.find('a', href='http://example.com/tillie')
_show_link(link1)

# "a" nodes with class "abc" and text "Python" (the result is not used).
soup.find_all('a', class_='abc', string='Python')

print('擷取帶lsi的連結')
link2 = soup.find('a', href=re.compile(r'lsi'))  # href matched with a regular expression
_show_link(link2)

print('擷取帶p的連結')
link3 = soup.find('p', class_="story")
print(link3.name, link3.get_text())

得到結果如下：

擷取所有帶a的連結
a http://example.com/elsie Elsie
a http://example.com/lacie Lacie
a http://example.com/tillie Tillie
擷取帶tillie的連結
a http://example.com/tillie Tillie
擷取帶lsi的連結
a http://example.com/elsie Elsie
擷取帶p的連結
p Once upon a time there were three little sisters; and their names wereElsie,Lacie andTillie;and they lived at the bottom of a well.

參考自：Python爬蟲----網頁解析器和BeautifulSoup第三方模組

接著，對urljoin函數的補充說明，舉例如下：

# Simple example of the urlparse helpers
from urllib import parse

base_url = 'http://baike.baidu.com/view/21087.htm'
relative_url = '/view/53557.htm'
# Pay attention to how "/" is used here; try variations to see the exact behavior.
joined_url = parse.urljoin(base_url, relative_url)
print(joined_url)

得到結果：

http://baike.baidu.com/view/53557.htm

一開始，按照視頻教程寫代碼，發現html檔案的中文讀取不到，或者出現中文亂碼的情況，為此在輸出器中添加了

fout.write("<head>")   #這三行是為瞭解決HTML檔案輸入中文亂碼        fout.write('<meta charset="utf-8">')  #這三行是為瞭解決HTML檔案輸入中文亂碼        fout.write("</head>")  #這三行是為瞭解決HTML檔案輸入中文亂碼

參考自：爬蟲顯示中文亂碼

最後，爬出得到的結果是（html檔案）：

以上，便是我根據慕課網視頻教程Python開發簡單爬蟲整理的資料，以及本人調試過程中，所遇到的問題及其解決方案。

附上完整代碼，直接執行htmlpc檔案就行。

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More