Code:
# -*- coding: utf-8 -*-
"""Crawl the NetEase news front page and save the linked articles as text.

The script downloads the front page, collects article URLs with a regular
expression, extracts the title, publication time, source and body from each
article page, and writes them to a single ``.txt`` file.

The module runs on both Python 2 and Python 3: ``print`` is only ever called
with one argument, and ``urllib2`` is aliased to ``urllib.request`` when the
Python 2 module is unavailable.
"""
import io
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved Request/urlopen into urllib.request; alias it so the rest
    # of the module is identical on both interpreters.
    import urllib.request as urllib2


def _first_match(pattern, text):
    """Return the first ``re.findall`` hit of *pattern* in *text*, or None.

    ``re.findall`` returns an empty list (never None) when nothing matches,
    so the emptiness check here is what protects callers from IndexError.
    """
    matches = re.findall(pattern, text)
    return matches[0] if matches else None


class Tool(object):
    """Turn an HTML article fragment into plain text via regex substitutions."""

    # Picture paragraphs are dropped entirely.
    removeImg = re.compile(r'<p class="f_center".*?</p>')
    # Anchor tags are dropped, their inner text is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that become a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # Paragraph openings become a new, indented line.
    replacePara = re.compile(r'<p.*?>')
    # Single or double <br> becomes one line break.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tag is still left is removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, text):
        """Return *text* with HTML markup converted to plain text.

        The substitutions are order-dependent: specific tags are handled
        first, and the catch-all tag remover runs last.
        """
        text = re.sub(self.removeImg, "", text)
        text = re.sub(self.removeAddr, "", text)
        text = re.sub(self.replaceLine, "\n", text)
        text = re.sub(self.replaceTD, "\t", text)
        text = re.sub(self.replacePara, "\n" + " ", text)
        text = re.sub(self.replaceBR, "\n", text)
        text = re.sub(self.removeExtraTag, "", text)
        return text.strip()


class Wyxw(object):
    """Crawler that collects NetEase news articles reachable from *baseurl*."""

    # Article links look like http://news.163.com/YY/MMDD/HH/<16 word chars>.html
    _URL_PATTERN = re.compile(
        r"http://news\.163\.com/\d{2}/\d{4}/\d{2}/\w{16}\.html")
    _TIME_PATTERN = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', re.S)
    _SOURCE_PATTERN = re.compile(r'<a id="ne_article_source".*?>(.*?)</a>')
    # NOTE(review): the original title pattern was lost in the source this
    # module was recovered from. A heading match with a <title> fallback is
    # used instead -- confirm against a live article page.
    _TITLE_PATTERNS = (
        re.compile(r'<h1.*?>(.*?)</h1>', re.S),
        re.compile(r'<title>(.*?)</title>', re.S),
    )
    _CONTENT_PATTERN = re.compile(
        r'<div id="endText".*?>(.*?)<!.*?-->', re.S)
    _SEPARATOR = (u"\n"
                  + u"-------------------------------------------------------------------------"
                  + u"\n")

    def __init__(self, baseurl):
        """Remember the front-page URL and set up request headers."""
        self.baseurl = baseurl
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        # Default output file name (without extension) used by writeData.
        self.filename = u'NetEase News'
        self.tool = Tool()

    def _fetch(self, url):
        """Download *url* with the configured headers and return decoded text.

        The body is decoded as UTF-8 with undecodable bytes ignored.
        NOTE(review): assumes article pages use the same charset as the front
        page -- verify, since only the front page was decoded originally.
        """
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        try:
            raw = response.read()
        finally:
            response.close()
        return raw.decode('utf-8', 'ignore')

    def get_homepage(self):
        """Return the decoded HTML of the front page."""
        return self._fetch(self.baseurl)

    def extract_url(self, homepage):
        """Return every article URL found in *homepage*, in page order.

        Duplicates are kept; an empty list means no link matched.
        """
        return self._URL_PATTERN.findall(homepage)

    def extract_sub_web_time(self, sub_web):
        """Return the first ``YYYY-MM-DD HH:MM:SS`` stamp in *sub_web*, or None."""
        time = _first_match(self._TIME_PATTERN, sub_web)
        if time is not None:
            print(time)
        return time

    def extract_sub_web_source(self, sub_web):
        """Return the text of the article-source link in *sub_web*, or None."""
        source = _first_match(self._SOURCE_PATTERN, sub_web)
        if source is not None:
            print(source)
        return source

    def extract_sub_web_title(self, sub_web):
        """Return the article title found in *sub_web*, or None."""
        for pattern in self._TITLE_PATTERNS:
            title = _first_match(pattern, sub_web)
            if title is not None:
                print(title)
                return title
        return None

    def extract_sub_web_content(self, sub_web):
        """Return the raw HTML of the article body in *sub_web*, or None."""
        return _first_match(self._CONTENT_PATTERN, sub_web)

    def _format_article(self, title, time, source, content):
        """Build the text record written to the output file for one article."""
        return (title + u"\n" + time + u"\t" + source + u"\n"
                + self.tool.replace(content) + u"\n")

    def writeData(self, fname):
        """Crawl all front-page articles and write them to ``<fname>.txt``.

        *fname* is the output path without extension; ``None`` selects
        ``self.filename``. Articles whose title or body cannot be extracted
        are skipped; a missing time or source is written as an empty string.
        """
        basename = fname if fname is not None else self.filename
        homepage = self.get_homepage()
        news_urls = self.extract_url(homepage)

        # io.open gives the same text-mode, explicit-encoding behaviour on
        # Python 2 and 3; the with-block guarantees the file is closed.
        with io.open(basename + u'.txt', 'w', encoding='utf-8') as out:
            for url in news_urls:
                print(url)
                web = self._fetch(url)
                title = self.extract_sub_web_title(web)
                content = self.extract_sub_web_content(web)
                time = self.extract_sub_web_time(web)
                source = self.extract_sub_web_source(web)
                if title is None or content is None:
                    continue
                out.write(self._format_article(
                    title.strip(),
                    (time or u"").strip(),
                    (source or u"").strip(),
                    content,
                ))
                out.write(self._SEPARATOR)
                print(u"News Write Success" + "\n")


BASEURL = "http://news.163.com"

if __name__ == '__main__':
    # Only crawl when executed as a script, so importing the module is free
    # of network and file-system side effects.
    wyxw = Wyxw(BASEURL)
    wyxw.writeData(None)
Python Crawler in Practice (III): Scraping NetEase News