import re
from urllib import request
class Sprder:
    """Interactive crawler that saves jokes from neihan8.com list pages.

    Attributes:
        page: Number of the list page fetched by the next ``loadPage`` call.
        switch: Loop flag; ``startwork`` keeps crawling while it is true.
    """

    # Each joke sits inside <div class="f18 mb20">...</div>; re.S lets "."
    # match newlines so a joke spanning several lines is captured whole.
    _CONTENT_PATTERN = re.compile(r'<div\sclass="f18 mb20">(.*?)</div>', re.S)

    # Markup fragments stripped from every captured joke before it is saved.
    _STRIP_TOKENS = ("<p>", "</p>", "<br>", "<br/>", "“")

    # Browser-like identity sent with every request.
    _USER_AGENT = (
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    )

    def __init__(self):
        """Start at list page 1 with the crawl loop enabled."""
        self.page = 1
        self.switch = True

    def loadPage(self):
        """Download the current list page and hand its jokes to ``dealpage``.

        The response body is decoded as GBK, every joke container is
        extracted with ``_CONTENT_PATTERN``, and the resulting list of raw
        HTML fragments is passed on for cleaning and saving.

        Raises:
            urllib.error.URLError: If the page cannot be fetched.
            UnicodeDecodeError: If the body is not valid GBK.
        """
        url = (
            "http://www.neihan8.com/article/list_5_"
            + str(self.page)
            + ".html"
        )
        headers = {"User-Agent": self._USER_AGENT}
        req = request.Request(url, headers=headers)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req) as response:
            html = response.read().decode("gbk")
        content_list = self._CONTENT_PATTERN.findall(html)
        self.dealpage(content_list)

    def dealpage(self, content_list):
        """Strip markup from each joke on a page and save it.

        Args:
            content_list: Raw HTML fragments, one per joke, as produced by
                ``loadPage``.
        """
        for item in content_list:
            for token in self._STRIP_TOKENS:
                item = item.replace(token, "")
            self.writepage(item)

    def writepage(self, item):
        """Append one cleaned joke to ``Satin.txt``.

        Args:
            item: Joke text to append; written as-is, with no separator.
        """
        # Explicit UTF-8 so non-ASCII joke text does not depend on the
        # platform's default encoding.
        with open("Satin.txt", "a", encoding="utf-8") as f:
            f.write(item)

    def startwork(self):
        """Crawl page after page until the user types ``quit``.

        After each page the user is prompted; pressing ENTER continues with
        the next page, while entering ``quit`` clears ``switch`` and ends the
        loop.
        """
        while self.switch:
            self.loadPage()
            command = input("If you continue to press ENTER (Exit input quit)")
            if command == "quit":
                self.switch = False
            # Advance unconditionally so ``page`` always names the next
            # page that has not been fetched yet.
            self.page += 1
if __name__ == "__main__":
    # Script entry point: build the crawler and run its interactive loop.
    duanzi_spider = Sprder()
    duanzi_spider.startwork()
# Python 3 web crawler: "neihan duanzi" (joke) scraper