# coding=utf-8
import re
import thread
import time
import urllib
import urllib2
class QSBK:
    """Interactive console reader for the "hot" pages of qiushibaike.com.

    Pages are downloaded one at a time, parsed into stories and buffered in
    ``self.stories``.  Each press of Enter prints the next story; entering
    ``Q`` ends the session.

    The class targets Python 2: it relies on the module-level ``urllib2``
    import and on the ``raw_input`` builtin.  Every ``print(...)`` call is
    given a single argument so the statements parse on Python 2 and 3 alike.

    Attributes:
        pageindex: Number of the next page to download (starts at 1).
        user_agent: Value sent as the ``User-Agent`` request header.
        headers: Header dict passed to every request.
        stories: Buffer of parsed pages; each entry is a list of
            ``[publisher, content, likes]`` lists.
        enable: ``True`` while the interactive session is running.
    """

    # NOTE(review): the pattern literal was truncated in the source after
    # "h2>(.*?)".  This is a reconstruction that ASSUMES markup of the form
    #   <h2>publisher</h2> ... content">text</div> ... number">likes</i>
    # and yields the three groups the rest of the class consumes
    # (publisher, content, likes).  Confirm against the real page markup;
    # override ``story_pattern`` on a subclass or instance if it differs.
    story_pattern = re.compile(
        r'h2>(.*?)</h2>.*?content">(.*?)</div>.*?number">(.*?)</i>', re.S)

    def __init__(self):
        """Set up an idle reader positioned at page 1 with an empty buffer."""
        self.pageindex = 1
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        self.headers = {'User-Agent': self.user_agent}
        self.stories = []
        self.enable = False

    def getpage(self, pageindex):
        """Download one "hot" page and return its HTML decoded as UTF-8.

        Args:
            pageindex: Page number appended to the listing URL.

        Returns:
            The decoded page source, or ``None`` when ``urllib2`` raises
            ``URLError`` (the reason is printed when the error carries one).
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageindex)
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print("Error %s" % (e.reason,))
            return None

    def getpageitems(self, pageindex):
        """Parse one page into a list of stories.

        Args:
            pageindex: Page number handed to :meth:`getpage`.

        Returns:
            A list of ``[publisher, content, likes]`` lists with surrounding
            whitespace stripped (empty when nothing matches), or ``None``
            when the page could not be loaded.
        """
        pagecode = self.getpage(pageindex)
        if not pagecode:
            print("Page load Error")
            return None
        items = re.findall(self.story_pattern, pagecode)
        return [[field.strip() for field in item] for item in items]

    def loadPage(self):
        """Fetch the next page into the buffer when it is running low.

        Does nothing unless the session is enabled and fewer than two pages
        are buffered.  ``pageindex`` only advances when the fetched page
        actually produced stories.
        """
        if not self.enable or len(self.stories) >= 2:
            return
        pagestories = self.getpageitems(self.pageindex)
        if pagestories:
            self.stories.append(pagestories)
            self.pageindex += 1

    # Both spellings occur in the original text; keep the lowercase one
    # available so either call form resolves.
    loadpage = loadPage

    def getonestory(self, pagestories, page):
        """Print the stories of one page, one per line of user input.

        Args:
            pagestories: List of ``[publisher, content, likes]`` lists.
            page: Page counter shown in front of each story.

        Before each story a line is read from the user; entering ``Q``
        disables the session and returns immediately.  The buffer is topped
        up on every keypress so the next page is ready in advance.
        """
        for story in pagestories:
            command = raw_input()
            self.loadPage()
            if command == "Q":
                self.enable = False
                return
            print(u"page %d\tPublisher: %s\tLikes: %s\n%s"
                  % (page, story[0], story[2], story[1]))

    def start(self):
        """Run the interactive session until the user quits.

        The session also ends when the buffer is empty and a further load
        attempt yields nothing; otherwise the loop would spin forever
        without ever fetching again.
        """
        print(u'Reading, enter view, Q exit')
        self.enable = True
        self.loadPage()
        nowpage = 0
        while self.enable:
            if not self.stories:
                # Buffer ran dry: try once more, give up if still empty.
                self.loadPage()
                if not self.stories:
                    self.enable = False
                continue
            pagestories = self.stories.pop(0)
            nowpage += 1
            self.getonestory(pagestories, nowpage)
# Build the reader and start the interactive session right away.
# NOTE: these are module-level statements, so they also run when the file
# is imported rather than executed directly.
Spider = QSBK ()
Spider.start ()
# Sample run:
#   C:\python.exe c:/python_test/qiubai.py
#   Reading, enter view, Q exit
#   Page 1    Publisher: Anonymous user    Likes: 1909
#   Follow the back and feel a lot of pressure.
#   Page 1    Publisher: Flowers and wind to fly    Likes: 440
#   I just forgot to bring you supper, you need to look at me with such eyes ...
# This script is shared as a record of learning how to crawl content from Qiushibaike (the "embarrassing stories" site).