# Crawl the contents of a directed web page and print a ranking table.


def gethtmltext(url: str) -> str:
    """Fetch *url* and return the decoded page text.

    Prints ``Error`` and returns an empty string when the request fails
    (connection problem, timeout, or non-2xx status), so callers can treat a
    falsy result as "nothing to parse".
    """
    # Imported where it is used so the formatting helper below stays
    # importable without the scraping dependencies installed.
    import requests

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        print('Error')
        return ""
    # Use the encoding detected from the body rather than the header default.
    response.encoding = response.apparent_encoding
    return response.text


def fillunivlist(ulist: list, html: str) -> None:
    """Append the first three cell strings of every table row to *ulist*.

    Rows are the tag children of the first ``<tbody>`` in *html*. Nothing is
    appended when *html* is empty or contains no ``<tbody>``; rows with fewer
    than three ``<td>`` cells are skipped.
    """
    import bs4
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html or "", "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    # Traverse the direct children of tbody.
    for tr in tbody.children:
        # Children also include whitespace strings; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # shorthand for tr.find_all('td')
        if len(tds) < 3:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string])


def printunivlist(ulist: list, num: int) -> None:
    """Print a header line followed by at most *num* rows of *ulist*.

    Each row must have at least three items. Fewer than *num* rows are
    printed when *ulist* is shorter; a non-positive *num* prints the header
    only.
    """
    # The middle column is padded with the full-width (ideographic) space
    # U+3000 so that Chinese text stays aligned; an ASCII space is too narrow.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("ranking", "School Name", "Provinces", fill))
    for row in ulist[:max(num, 0)]:
        print(tplt.format(row[0], row[1], row[2], fill))


def main() -> None:
    """Download the 2016 ranking page and print its first 20 rows."""
    uinfo = []
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
    html = gethtmltext(url)
    fillunivlist(uinfo, html)
    printunivlist(uinfo, 20)


if __name__ == "__main__":
    main()
# Directed web crawler