Fetch all jokes from http://www.qiushibaike.com/textnew/ (35 pages in total) and save each page to a local text file.
Beyond the code itself, the regular expressions used here are worth studying.
Site Source Fragments:
<Ahref= "/users/32215536/"target= "_blank"Title= "eat two bowls and Sheng" ><H2> eat two bowls and Sheng </H2></A><Divclass= "Articlegender Manicon" >38</Div></Div><Ahref= "/article/119080581"target= "_blank"class= ' Contentherf ' ><Divclass= "Content" ><Span> a sigh! <Br/> more than 20 years ago, a humble rural primary school in a class, there are two boys sitting on the same bench, this is my deskmate and I. <Br/> Deskmate is the enemy, occasionally he returned to the seat, I quietly force, so that one side of the table slightly cocked, and then quietly back to rotate, table sit empty, fell on the ground, I stole the music! <Br/> the same trick can not always use, but when children know this. That time, I have the same old tricks, how to expect the deskmate early, leg back kicking, stool turned up, I successfully sat on the ground. <Br/> to the present, afraid of cloudy day! </Span></Div></A><Divclass= "Stats" ><Spanclass= "Stats-vote" ><Iclass= "Number" >62</I> Funny </Span><Spanclass= "Stats-comments" ></Span></Div><DivId= "qiushi_counts_119080581"class= "Stats-buttons bar Clearfix" ><Ulclass= "Clearfix" ><LiId= "vote-up-119080581"class= "Up" ><AHref= "javascript:voting (119080581,1)"class= "Voting"Data-article= "119080581"Id= "up-119080581"Rel= "nofollow" ><i></I><Spanclass= "Number hidden" >68</Span></A></Li><LiId= "vote-dn-119080581"class= "Down" ><AHref= "javascript:voting (119080581,-1)"class= "Voting"Data-article= "119080581"Id= "dn-119080581"Rel= "nofollow" ><i></I><Spanclass= "Number hidden" >-6</Span></A></Li><Liclass= "Comments" ><Ahref= "/article/119080581"Id= "c-119080581"Class= "Qiushi_comments"target= "_blank" ><i></I></A></Li></Ul></Div><Divclass= "Single-share" ><AClass= "Share-wechat"Data-type= "WeChat"title= "Share to"Rel= "nofollow" ></A><AClass= "SHARE-QQ"Data-type= "QQ"Title= "Share to QQ"Rel= "nofollow" >qq</A><AClass= "Share-qzone"Data-type= "Qzone"Title= "Sharing to QQ space"Rel= "nofollow" >qq space </A><AClass= "Share-weibo"Data-type= 
"Weibo"Title= "Share to Weibo"Rel= "nofollow" > Weibo </A></Div><Divclass= "Single-clear" ></Div></Div>< Span id= "line1430" ><div class= "article block untagged mb15" id= ' qiushi_tag_119080574 ' ><div class = "Author Clearfix" ><a href= "/users/31546279/" target= "_blank" rel= "nofollow" ></a><a href="/users/31546279/"target=" _blank "title="? Farthest distance "><H2>? Farthest distance
Python code
# coding=utf-8
import os
import re
import urllib.error
import urllib.request
class Spider(object):
    """Scrape the joke listing pages of qiushibaike.com/textnew and save
    each page's jokes to a local text file (one .txt per page)."""

    def __init__(self):
        # %s is filled with the page number (1..34) in get_page();
        # the ?s=... query string was copied from the site's own pagination links.
        self.url = 'http://www.qiushibaike.com/textnew/page/%s?s=4832452'
        # Pretend to be a desktop Firefox; the site rejects the default
        # Python urllib User-Agent.
        self.user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; '
                           'rv:53.0) Gecko/20100101 Firefox/53.0')

    def get_page(self, page_index):
        """Download listing page *page_index* and return its HTML as str.

        Prints the error and exits the process on HTTP/URL errors
        (best-effort script behavior, kept from the original).
        """
        headers = {'User-Agent': self.user_agent}
        try:
            url = self.url % str(page_index)
            request = urllib.request.Request(url, headers=headers)
            response = urllib.request.urlopen(request)
            # Decode bytes -> str so the str regex in analysis() can match.
            content = response.read().decode('utf-8')
            return content
        except urllib.error.HTTPError as e:
            print(e)
            exit()
        except urllib.error.URLError:
            print('URLError ' + self.url)
            exit()

    # Parse the page source
    def analysis(self, content):
        """Return the list of <div class="content">...</div> fragments,
        one per joke, found in *content*."""
        # re.S lets .*? span the <br/>-separated multi-line joke bodies.
        pattern = re.compile('<div class="content">.*?</div>', re.S)
        items = re.findall(pattern, content)
        return items

    # Save the scraped content
    def save(self, items, i, path):
        """Strip the HTML tags from each joke in *items* and append them
        all to <path>/<i>.txt, creating *path* if needed."""
        if not os.path.exists(path):
            os.makedirs(path)
        file_path = path + '/' + str(i) + '.txt'
        # with-statement guarantees the file is closed even on write errors
        # (the original closed it inside the loop, truncating multi-item pages).
        with open(file_path, 'w') as f:
            for item in items:
                item_new = (item.replace('\n', '')
                                .replace('<br/>', '\n')
                                .replace('<div class="content">', '')
                                .replace('</div>', '')
                                .replace('</span>', '\n')
                                .replace('<span>', '\n'))
                f.write(item_new)

    def run(self):
        """Fetch, parse and save pages 1..34."""
        for i in range(1, 35):
            content = self.get_page(i)
            items = self.analysis(content)
            # NOTE(review): hard-coded author-local output directory,
            # kept for fidelity to the original article.
            self.save(items, i, '/users/huangxuelian/downloads/41527218/pythontest')
# Entry point: build the spider and crawl all 35 listing pages.
if __name__ == '__main__':
    spider = Spider()
    spider.run()
Python web crawler