Script one:
#!/usr/bin/env python
# coding: utf-8
from bs4 import BeautifulSoup
import urllib
import re

art = {}
for page in range(1, 5):
    page = str(page)
    url = 'http://yujianglei.blog.51cto.com/all/7215578/page/' + page
    response = urllib.urlopen(url).read()
    response = unicode(response, 'GBK').encode('UTF-8')
    soup = BeautifulSoup(response, 'html.parser')
    a_tag = soup.find_all(href=re.compile(r"^/\d{7}/\d{7}$"))
    for i in a_tag:
        art_name = i.string
        string = str(i)
        # The href sits at a fixed offset inside the tag's HTML, e.g. <a href="/7215578/1771058">...</a>
        art_url = 'http://yujianglei.blog.51cto.com' + string[9:25]
        art[art_name] = art_url

for k, v in art.items():
    print k, '  ', v

# The code above extracts each href by string slicing and only prints the article names and article URLs.
######################################################################
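A note on the slice above: because the blog ID and every article ID are seven digits, each matching anchor has the fixed-width form <a href="/7215578/NNNNNNN">title</a>, so characters 9 to 25 of str(i) are exactly the 16-character href. A minimal sketch of that assumption (the tag text below is made up for illustration):

# Illustrative only: the fixed-width markup that the slice in script one relies on.
tag_html = '<a href="/7215578/1771058">Some article title</a>'
href = tag_html[9:25]   # '<a href="' is 9 characters long, the href itself is 16
print href              # -> /7215578/1771058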
Script two:
from bs4 import BeautifulSoup
import urllib
import re

art = {}
for page in range(1, 5):
    page = str(page)
    url = 'http://yujianglei.blog.51cto.com/all/7215578/page/' + page
    response = urllib.urlopen(url).read()
    response = unicode(response, 'GBK').encode('UTF-8')
    soup = BeautifulSoup(response, 'html.parser')
    a_tag = soup.find_all(href=re.compile(r"^/\d{7}/\d{7}$"))
    for i in a_tag:
        art_name = i.string
        # Read the href directly from the tag instead of slicing its HTML text
        art_url = 'http://yujianglei.blog.51cto.com' + i['href']
        art[art_name] = art_url

for k, v in art.items():
    print k, '  ', v

# The code above extracts each href through BeautifulSoup attribute access and only prints the article names and article URLs.
######################################################################
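The only change from script one is that the href is read straight off the tag. A small self-contained sketch of the find_all(href=...) plus tag['href'] pattern; the HTML snippet is invented for illustration:

# Self-contained illustration of the pattern used in script two; the HTML is made up.
from bs4 import BeautifulSoup
import re

html = '<a href="/7215578/1771058">Post A</a><a href="/about">About</a>'
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all(href=re.compile(r"^/\d{7}/\d{7}$")):
    print tag.string, tag['href']   # only the first anchor matches the pattern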
Script three:
#!/usr/bin/env python
# coding: utf-8
from bs4 import BeautifulSoup
import urllib
import re
import xlsxwriter

k_art_name = []
v_art_url = []
for page in range(1, 5):
    page = str(page)
    url = 'http://yujianglei.blog.51cto.com/all/7215578/page/' + page
    request = urllib.urlopen(url)
    response = request.read()
    response = unicode(response, 'GBK').encode('UTF-8')
    soup = BeautifulSoup(response, 'html.parser')
    a_tag = soup.find_all(href=re.compile(r"^/\d{7,}/\d{7,}$"))
    for i in a_tag:
        print i
        art_name = i.string
        art_url = 'http://yujianglei.blog.51cto.com' + i['href']
        k_art_name.append(art_name)
        v_art_url.append(art_url)

# Workbook file name and worksheet name
workbook = xlsxwriter.Workbook(u'51cto blog.xlsx')
worksheet = workbook.add_worksheet(u'Yu Jianglei')
title = [u'Article title', u'Article URL']

# Header format: border, background color, centered content, bold font, font size
format_title = workbook.add_format()
format_title.set_border(1)
format_title.set_bg_color('#cccccc')
format_title.set_align('center')
format_title.set_bold()
format_title.set_size(14)   # font size; the original value was lost, 14 is an assumption

# Body format
format_body = workbook.add_format()
format_body.set_border()
format_body.set_align('left')

# Row height and column width
worksheet.set_row(0, 40)
worksheet.set_column('A:B', 50)

# Write the header row, then the two data columns
worksheet.write_row('A1', title, format_title)
worksheet.write_column('A2', k_art_name, format_body)
worksheet.write_column('B2', v_art_url, format_body)
workbook.close()

# The code above combines BeautifulSoup with xlsxwriter and saves the article names and article URLs to an Excel file.
######################################################################
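Besides the Excel output, script three also widens the href pattern from \d{7} to \d{7,}. A quick sketch of what that buys, with the second path invented as a hypothetical longer ID:

# Illustration of the looser pattern in script three: it still matches seven-digit
# blog and article IDs but also tolerates longer ones.
import re

pattern = re.compile(r"^/\d{7,}/\d{7,}$")
print bool(pattern.match('/7215578/1771058'))    # True  (7-digit blog and article IDs)
print bool(pattern.match('/7215578/17710580'))   # True  (hypothetical 8-digit article ID)
print bool(pattern.match('/about'))              # False (not an article link)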
This article comes from the "Do not ask for the best, only better" blog; please be sure to keep this source: http://yujianglei.blog.51cto.com/7215578/1771058
Python crawler: crawl a 51cto blog expert's article names and article URLs