Tag: Python crawler
# coding=utf-8
# Crawl the "hot" listing pages of qiushibaike.com and print every post found.
#
# The script runs on both Python 2 and Python 3: urllib2 is aliased to
# urllib.request on Python 3, and every print call takes one pre-formatted
# string so it means the same thing as a statement or as a function call.
import contextlib
import re
import urllib

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes the same Request/urlopen API under urllib.request.
    import urllib.request as urllib2


class qiushi:
    """Fetch, parse and print posts from the qiushibaike.com "hot" pages."""

    # Listing URL; the page number is appended to it.
    BASE_URL = "http://www.qiushibaike.com/hot/page/"
    # Browser-like User-Agent sent with each request (disguises the crawler).
    USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    # Seconds to wait for the server, so a stalled connection cannot hang
    # the crawl forever.
    TIMEOUT = 10
    # Group 1 is the div's title attribute, group 2 is the div's inner text.
    # re.S lets "." span newlines because a post body covers several lines.
    ITEM_PATTERN = re.compile(
        r'<div.*?class="content".*?title="(.*?)">(.*?)</div>', re.S
    )

    def __init__(self):
        self.page = 1

    @staticmethod
    def _parse_qiushis(unicode_html):
        """Extract ``[title, body]`` pairs from decoded page HTML.

        Args:
            unicode_html: The page source as text.

        Returns:
            A list of two-element lists ``[title, body]`` with every newline
            removed from both parts; empty when nothing matches.
        """
        return [
            [title.replace("\n", ""), body.replace("\n", "")]
            for title, body in qiushi.ITEM_PATTERN.findall(unicode_html)
        ]

    def Getqiushis(self, page):
        """Download one listing page and return the posts on it.

        Args:
            page: Page number to fetch (a string, or anything ``str()`` can
                render).

        Returns:
            A list of ``[title, body]`` pairs as produced by
            ``_parse_qiushis``.

        Raises:
            urllib2.URLError: Propagated from ``urlopen`` when the request
                fails.
            UnicodeDecodeError: When the response is not valid UTF-8.
        """
        url = self.BASE_URL + str(page)
        headers = {"User-Agent": self.USER_AGENT}
        req = urllib2.Request(url, headers=headers)
        # closing() releases the connection even if read() raises, and works
        # on Python 2 where the response object is not a context manager.
        with contextlib.closing(
            urllib2.urlopen(req, timeout=self.TIMEOUT)
        ) as response:
            html = response.read()
        # decode() turns the raw bytes into text before pattern matching.
        return self._parse_qiushis(html.decode("utf-8"))

    def Showqiushi(self, contents):
        """Print every post in ``contents`` with a running 1-based number.

        Args:
            contents: An iterable of ``[title, body]`` pairs.
        """
        for count, content in enumerate(contents, 1):
            # Spacing reproduces what a comma-separated print of
            # (header, title, "\n") emits: one space between the items.
            print("Article%d embarrassing thing  %s \n" % (count, content[0]))
            print("%s \n" % content[1])

    def Start(self):
        """Crawl pages 1 through 4, printing each page's posts in order."""
        for page in range(1, 5):
            print("page%d: \n" % page)
            contents = self.Getqiushis(str(page))
            self.Showqiushi(contents)


if __name__ == "__main__":
    # Guarded so importing this module does not trigger network traffic, and
    # bound to a new name so the class itself is not shadowed by an instance.
    spider = qiushi()
    spider.Start()
[Python] Crawling Qiushibaike (the "Encyclopedia of Embarrassing Stories")