1. Tools used:
Python3.5
BeautifulSoup
2. Site to crawl:
CSDN list of popular articles: http://blog.csdn.net/hot.html
3. Implementation code:
__author__ = 'Administrator'

import re
import urllib.request

from bs4 import BeautifulSoup


##########################################################
# Crawl CSDN home articles: http://blog.csdn.net/?&page=1
##########################################################
class csdnutils(object):
    """Fetch a CSDN blog listing page and print one summary line per article."""

    def __init__(self):
        """Prepare the browser-like HTTP headers sent with every request."""
        user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.102 Safari/537.36')
        self.headers = {
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': user_agent,
        }

    def getpage(self, url=None):
        """Download ``url`` and return it parsed as a BeautifulSoup tree.

        Args:
            url: Address of the page to fetch.

        Returns:
            A ``BeautifulSoup`` object built with the ``html.parser`` backend.
        """
        request = urllib.request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(request) as response:
            soup = BeautifulSoup(response.read(), "html.parser")
        # print(soup.prettify())
        return soup

    def parsepage(self, url=None, page=None):
        """Fetch ``url`` and print the articles found in its listing.

        Each ``<div class="blog_list">`` element is treated as one article.
        The author, post time, view count, category, title and link are
        printed on a single tab-separated line.

        Args:
            url: Address of the listing page to fetch.
            page: Page label shown in the banner line; only used for display.
        """
        soup = self.getpage(url)
        articles = soup.find_all('div', 'blog_list')
        print("======================== First", page,
              "Page ======================================")
        for index, item in enumerate(articles):
            # Per-article values are plain locals; nothing is stored on the
            # class, so no state leaks between articles or instances.
            author = item.find('a', 'user_name').string
            post_time = item.find('span', 'time').string
            view_count = item.find('a', 'view').string

            heading = item.find('h1')
            # A category link is looked up only when the first link in the
            # heading carries a class attribute.
            if heading.find('a').has_attr('class'):
                category = heading.find('a', 'category').string
            else:
                category = "None"

            # The title link is the anchor that has a ``name`` attribute.
            title_link = heading.find('a', attrs={'name': True})
            title = title_link.string
            link = title_link.get("href")

            print("Data:", index + 1, '\t', author, '\t', post_time, '\t',
                  view_count, '\t', category, '\t', title, '\t', link)


####### Execution entry point ########
if __name__ == "__main__":
    # URL of the web page to crawl.
    # NOTE: this URL carries no page parameter, so every iteration below
    # requests the same address; the paginated home feed has the form
    # 'http://blog.csdn.net/?&page={}'.format(i + 1).
    url = "http://blog.csdn.net/hot.html"
    cnblog = csdnutils()
    for i in range(0, 5):
        cnblog.parsepage(url, i + 1)
4. Execution Result:
Python implementation: crawling the CSDN list of popular articles