This article describes how to crawl NetEase news comments with Python and regular expressions, walking through a complete example of the implementation technique and the points to watch out for. It is shared for your reference, as follows:
While writing my own crawler for NetEase news, I found that the comments do not appear in the page's HTML source. So I used a packet-capture tool to find the hidden address the comments are loaded from (every browser ships with its own capture/developer tools, which can be used to analyze a site's network requests).
If you look closely at the captured traffic, you will find one special request, and that is exactly the comment data you are after.
Open that link and you can see the comments (this is the first page of content).
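Note that what this address returns is not plain JSON: the cached comment pages wrap the data in a JavaScript assignment (var replyData=... on the first page, var newPostList=... on later pages), which is why the script below strips that prefix before parsing. As a quick orientation, here is a minimal sketch of peeling a single page by hand; the article ID is taken from the full script below, and the exact trailing characters to trim are an assumption:

#coding=utf-8
# Minimal sketch: fetch one cached comment page and peel out the JSON.
# Assumes the response still looks like "var newPostList={...};".
import json
import urllib2
url = 'http://comment.news.163.com/cache/newlist/news3_bbs/B9IBDHEH000146BE_2.html'
raw = urllib2.urlopen(url).read()
payload = raw.replace('var newPostList=', '')[:-2]  # drop the JS prefix and trailing ";" (assumed)
comments = json.loads(payload)
print comments.keys()  # expect a 'newPosts' key if the format is unchanged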
The code follows (adapted and rewritten from another author's work):
#coding=utf-8
import urllib2
import re
import json
import time

class WY():
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.24 (KHTML, like Gecko)'}
        self.url = 'http://comment.news.163.com/data/news3_bbs/df/B9IBDHEH000146BE_1.html'

    # Build the URL of the cached comment list for the given page
    def getpage(self, page):
        full_url = 'http://comment.news.163.com/cache/newlist/news3_bbs/B9IBDHEH000146BE_' + str(page) + '.html'
        return full_url

    # Download a URL and return the raw response body (None on failure)
    def gethtml(self, page):
        try:
            req = urllib2.Request(page, None, self.headers)
            response = urllib2.urlopen(req)
            html = response.read()
            return html
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print u"Connection failed", e.reason
            return None

    # Process the string: strip the JS wrapper and HTML leftovers so only JSON remains
    def process(self, data, page):
        if page == 1:
            data = data.replace('var replyData=', '')
        else:
            data = data.replace('var newPostList=', '')
        reg1 = re.compile("\[<a href=''>")
        data = reg1.sub('', data)
        reg2 = re.compile('<\\\/a>\]')
        data = reg2.sub('', data)
        reg3 = re.compile('<br>')
        data = reg3.sub('', data)
        return data

    # Parse the JSON and append the comment fields to WY.txt
    def dealjson(self):
        with open('WY.txt', 'a') as f:
            f.write('ID' + '|' + 'comment' + '|' + 'floor' + '|' + 'top' + '\n')
        for i in range(1, 12):
            if i == 1:
                data = self.gethtml(self.url)
                data = self.process(data, i)[:-1]  # drop the trailing ";"
                value = json.loads(data)
                f = open('WY.txt', 'a')
                for item in value['hotPosts']:
                    try:
                        f.write(item['1']['f'].encode('utf-8') + '|')
                        f.write(item['1']['b'].encode('utf-8') + '|')
                        f.write(item['1']['a'].encode('utf-8') + '|')
                        f.write(item['1']['v'].encode('utf-8') + '\n')
                    except:
                        continue
                f.close()
                print '-- collecting %d/12 --' % i
                time.sleep(5)
            else:
                page = self.getpage(i)
                data = self.gethtml(page)
                data = self.process(data, i)[:-2]  # drop the trailing ";" and newline
                value = json.loads(data)
                f = open('WY.txt', 'a')
                for item in value['newPosts']:
                    try:
                        f.write(item['1']['f'].encode('utf-8') + '|')
                        f.write(item['1']['b'].encode('utf-8') + '|')
                        f.write(item['1']['a'].encode('utf-8') + '|')
                        f.write(item['1']['v'].encode('utf-8') + '\n')
                    except:
                        continue
                f.close()
                print '-- collecting %d/12 --' % i
                time.sleep(5)

if __name__ == '__main__':
    WY().dealjson()
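The script above targets Python 2 (urllib2 and print statements). For readers on Python 3, here is a rough sketch of the same fetch-strip-parse flow; the endpoint format carries over from the script above, but whether it still responds, and its response charset, are assumptions:

# Python 3 sketch of the same flow; endpoint behaviour and charset are assumptions.
import json
import re
import urllib.request

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.24 (KHTML, like Gecko)'}

def fetch(url):
    req = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8', errors='replace')  # charset assumed

def strip_wrapper(data, first_page):
    # Same cleanup as the Python 2 script: remove the JS prefix and HTML leftovers.
    prefix = 'var replyData=' if first_page else 'var newPostList='
    data = data.replace(prefix, '')
    data = re.sub(r"\[<a href=''>|<\\/a>\]|<br>", '', data)
    return data.rstrip().rstrip(';')  # drop the trailing ";" however the page ends

page = fetch('http://comment.news.163.com/cache/newlist/news3_bbs/B9IBDHEH000146BE_2.html')
posts = json.loads(strip_wrapper(page, first_page=False))
for item in posts.get('newPosts', []):
    print('|'.join(item['1'].get(k, '') for k in ('f', 'b', 'a', 'v')))

Running the full Python 2 script appends one pipe-separated line per comment to WY.txt; the Python 3 sketch just prints the same fields instead.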