標籤:[] list tag UI time inf for int timeout
爬取的是「最好大學網」,提取 2017 年排名前 20 名大學的名稱和分數。
# coding: utf-8
"""Scrape a university ranking page and print the name and score of the top rows."""
import sys

import bs4
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Returns the sentinel string ``"fail"`` when the request fails
    (connection error, timeout, or a non-success HTTP status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body instead of the
        # one declared in the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures are treated as "fetch failed";
        # KeyboardInterrupt and programming errors still propagate.
        return "fail"


def fillUnivList(ulist, html):
    """Append one ``[cell0, cell1, cell3]`` entry to *ulist* per table row.

    The text of the 1st, 2nd and 4th ``<td>`` of every ``<tr>`` directly under
    the first ``<tbody>`` in *html* is collected (presumably rank, name and
    score -- confirm against the page layout). *ulist* is modified in place.
    If *html* contains no ``<tbody>`` (for example the ``"fail"`` sentinel
    from ``getHTMLText``), *ulist* is left untouched.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes between rows; skip them.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr.find_all("td")
        if len(tds) < 4:
            # Not a full data row; indexing tds[3] would fail.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printUnivList(ulist, num):
    """Print the 2nd and 3rd field of the first *num* entries of *ulist*.

    Prints fewer lines when *ulist* holds fewer than *num* entries, and
    nothing when *num* is zero or negative.
    """
    # max() keeps a negative num from being read as a from-the-end slice bound.
    for u in ulist[:max(num, 0)]:
        print(u[1], u[2])


def main():
    """Download the 2017 ranking page and print the first 20 rows."""
    uinfo = []
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2017.html"
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    if not uinfo:
        # Either the download failed or the page layout changed.
        print("No ranking rows could be extracted from " + url, file=sys.stderr)
        return
    printUnivList(uinfo, 20)


if __name__ == "__main__":
    main()
結果:
python自學2——爬蟲