#!/usr/bin/env python3
# -*- coding: utf-8 -*-
############################
# File Name: zuihaodaxuepaiming.py
# Author: frank
# Mail: [email protected]
# Created time: 2018-05-15 21:45:55
############################
"""Crawl a university ranking page and print the top entries as a table.

The script downloads the ranking page, extracts (rank, school name, total
score) from each row of the first ``<tbody>``, and prints the first rows.
"""

# Seconds to wait for the server before treating the download as failed.
_REQUEST_TIMEOUT = 10

# chr(12288) is the full-width (ideographic) space; it is used as the fill
# character for the school-name column so that CJK names line up.
_CJK_SPACE = chr(12288)

# Column layout shared by the header row and every data row.
_ROW_FORMAT = "{0:^10}\t{1:{3}^10}\t{2:^10}"


def gethtmltext(url):
    """Return the text of the page at *url*, or "" if the download fails.

    The response is decoded as UTF-8. Any ``requests`` error (connection
    problem, timeout, non-2xx status) is reported on stdout and turned into
    an empty string, so callers can treat "" as "no page".
    """
    # Imported here so the parsing/printing helpers stay importable on
    # machines that do not have the scraping dependencies installed.
    import requests

    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        response.encoding = "utf-8"
        return response.text
    except requests.RequestException:
        print("Crawl Exception")
        return ""


def getunivinfo(ulst, html):
    """Append ``[rank, name, score]`` for each ranking row in *html* to *ulst*.

    Rows are the direct ``<tr>`` children of the first ``<tbody>``; the rank,
    name and score are taken from cells 0, 1 and 3. The number of rows found
    is printed. *ulst* is modified in place; nothing is returned. An empty
    page, a page without ``<tbody>``, and rows with fewer than four cells are
    skipped instead of raising.
    """
    if not html:
        # gethtmltext() returns "" on failure: nothing to parse.
        return

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")
    tbody = soup.tbody
    if tbody is None:
        return

    rows = tbody.find_all("tr", recursive=False)
    print(len(rows))
    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 4:
            continue
        ulst.append([cells[0].string, cells[1].string, cells[3].string])


def printunivinfo(ulst, num):
    """Print a header line followed by the first *num* entries of *ulst*.

    Each entry is a sequence whose first three items are rank, school name
    and total score. ``None`` items (a table cell without a single text
    node) are printed as empty fields.
    """
    print(_ROW_FORMAT.format("ranking", "School Name", "Total", _CJK_SPACE))
    for uni in ulst[:num]:
        rank, name, score = ("" if item is None else str(item) for item in uni[:3])
        print(_ROW_FORMAT.format(rank, name, score, _CJK_SPACE))


def main():
    """Download the 2016 ranking page and print its first ten entries."""
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    ulst = []
    html = gethtmltext(url)
    print("Gethtmltext")
    getunivinfo(ulst, html)
    print("Getunivinfo")
    printunivinfo(ulst, 10)


if __name__ == "__main__":
    main()
This example mainly demonstrates how to use requests and BeautifulSoup together.
A Python crawler for Chinese university rankings