技術選型
下載器是Requests
解析使用的是Regex
效果圖:
準備好各個包
# -*- coding: utf-8 -*-
import requests                                   # third-party HTTP downloader
import re                                         # regex for parsing the page
import json                                       # serialize scraped data for saving
from requests.exceptions import RequestException  # targeted exception handling
from multiprocessing import Pool                  # crawl pages with multiple processes
開始編寫代碼,new一個py檔案 1.requests下載頁面
response = requests.get(url) — 其中 url 是當前需要爬取的連結,requests.get() 用於擷取頁面
這裡需要注意編碼的問題;
就像下面這樣:
response = requests.get(url) if response.status_code == 200: return response.content.decode("utf-8") return None
這樣返回的就是一個string類型的資料 2.except RequestException:捕捉異常
為了代碼更加健壯,我們在可能發生異常的地方做異常捕獲
try: response = requests.get(url) if response.status_code == 200: return response.content.decode("utf-8") return None except RequestException: return None
更多異常介紹官網
http://www.python-requests.org/en/master/_modules/requests/exceptions/#RequestException
到這裡,我們就可以編寫main方法進行調用程式了
代碼如下:
# -*- coding: utf-8 -*-
import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Download *url* and return its body decoded as UTF-8, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Decode explicitly to avoid encoding problems.
            return response.content.decode("utf-8")
        return None
    except RequestException:
        # Any requests-level error (timeout, connection, ...) -> no page.
        return None


def main():
    url = 'https://coding.imooc.com/?page=1'
    html = get_one_page(url)
    print(html)


if __name__ == '__main__':
    main()
這樣就可以把頁面下載下來了 接著,就是解析頁面 3.Regex介紹 re.compile()方法:編譯Regex
通過一個Regex字串 編譯產生 一個字串對象 re.findall(pattern,html)方法:找到所有匹配的內容
參數:
pattern:編譯過的Regex
html:用 response.content.decode("utf-8") 得到的頁面內容
def parse_one_page(html):
    """Yield one dict per course card found in the page *html*.

    Each dict has the keys: teacher, title, grade, people, score, describe.
    Yields nothing when the pattern matches nothing.
    """
    # re.S lets '.' match newlines so one pattern can span a whole card's markup.
    pattern = re.compile(
        '<div class="box">.*?lecturer-info.*?<span>(.*?)</span>.*?shizhan-intro-box.*?title=".*?">'
        '(.*?)</p>.*?class="grade">(.*?)</span>.*?imv2-set-sns.*?</i>'
        '(.*?)</span>.*?class="big-text">(.*?)</p>.*?shizan-desc.*?>'
        '(.*?)</p>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        # Shape each match tuple into a labelled dict.
        yield {
            'teacher': item[0],
            'title': item[1],
            'grade': item[2],
            'people': item[3],
            'score': item[4],
            'describe': item[5],
        }
完整代碼:
# -*- coding: utf-8 -*-
import requests
import re
from requests.exceptions import RequestException


def get_one_page(url):
    """Download *url* and return its body decoded as UTF-8, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict per course card found in the page *html*."""
    # re.S lets '.' match newlines so the pattern can span a whole card's markup.
    pattern = re.compile(
        '<div class="box">.*?lecturer-info.*?<span>(.*?)</span>.*?shizhan-intro-box.*?title=".*?">'
        '(.*?)</p>.*?class="grade">(.*?)</span>.*?imv2-set-sns.*?</i>'
        '(.*?)</span>.*?class="big-text">(.*?)</p>.*?shizan-desc.*?>'
        '(.*?)</p>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'teacher': item[0],
            'title': item[1],
            'grade': item[2],
            'people': item[3],
            'score': item[4],
            'describe': item[5],
        }


def main():
    url = 'https://coding.imooc.com/?page=1'
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)


if __name__ == '__main__':
    main()
儲存解析後的資料到本地檔案
4.儲存檔案操作
with open('imooctest.txt','a',encoding='utf-8') as f — with...as:開啟會自動關閉的檔案並設立對象 f 進行操作。參數說明:imooctest.txt 為檔案名稱;a 表示追加寫入方式;encoding 指定編碼格式,不這樣設定儲存的資料可能會亂碼。 f.write(json.dumps(content,ensure_ascii=False)+'\n') — json.dumps 將剛才被格式化後的字典轉為字串;ensure_ascii=False 保留非 ASCII 字元,不這樣設定儲存的資料可能會亂碼;+'\n' 使每條資料各佔一行
代碼如下:
# -*- coding: utf-8 -*-
import requests
import re
import json
from requests.exceptions import RequestException


def get_one_page(url):
    """Download *url* and return its body decoded as UTF-8, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict per course card found in the page *html*."""
    # re.S lets '.' match newlines so the pattern can span a whole card's markup.
    pattern = re.compile(
        '<div class="box">.*?lecturer-info.*?<span>(.*?)</span>.*?shizhan-intro-box.*?title=".*?">'
        '(.*?)</p>.*?class="grade">(.*?)</span>.*?imv2-set-sns.*?</i>'
        '(.*?)</span>.*?class="big-text">(.*?)</p>.*?shizan-desc.*?>'
        '(.*?)</p>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'teacher': item[0],
            'title': item[1],
            'grade': item[2],
            'people': item[3],
            'score': item[4],
            'describe': item[5],
        }


def write_to_file(content):
    """Append *content* to imooctest.txt as one JSON line (UTF-8, non-ASCII kept)."""
    # `with` closes the file automatically -- no explicit f.close() needed.
    with open('imooctest.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main():
    url = 'https://coding.imooc.com/?page=1'
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    main()
5.爬取所有頁面並以多進程方式
分析頁面,會發現,需要爬取的頁面如下
https://coding.imooc.com/?page=1
https://coding.imooc.com/?page=2
https://coding.imooc.com/?page=3
https://coding.imooc.com/?page=4
我們需要構造這種格式的頁面
url = 'https://coding.imooc.com/?page=' + str(page)
主函數可以類似這樣:
for i in range(4):
main(i+1)
完整代碼:
# -*- coding: utf-8 -*-
import requests
import re
import json
from requests.exceptions import RequestException
from multiprocessing import Pool


def get_one_page(url):
    """Download *url* and return its body decoded as UTF-8, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict per course card found in the page *html*."""
    # re.S lets '.' match newlines so the pattern can span a whole card's markup.
    pattern = re.compile(
        '<div class="box">.*?lecturer-info.*?<span>(.*?)</span>.*?shizhan-intro-box.*?title=".*?">'
        '(.*?)</p>.*?class="grade">(.*?)</span>.*?imv2-set-sns.*?</i>'
        '(.*?)</span>.*?class="big-text">(.*?)</p>.*?shizan-desc.*?>'
        '(.*?)</p>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'teacher': item[0],
            'title': item[1],
            'grade': item[2],
            'people': item[3],
            'score': item[4],
            'describe': item[5],
        }


def write_to_file(content):
    """Append *content* to imoocAll.txt as one JSON line (UTF-8, non-ASCII kept)."""
    # `with` closes the file automatically -- no explicit f.close() needed.
    with open('imoocAll.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(page):
    """Crawl one listing page, print and persist every parsed course."""
    url = 'https://coding.imooc.com/?page=' + str(page)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Fan the 4 listing pages out across a process pool.
    pool = Pool()
    pool.map(main, [i + 1 for i in range(4)])
到這裡,我們就能夠把慕課網上面的全部實戰課程的資訊爬取下來,拿到這些資料,你就可以做自己喜愛的分析了