This article presents an example Netease (163.com) web crawler written in Python that collects all of the text information on a Netease page.
The example shows how to use Python to fetch all of the text on the Netease home page and save it to a TXT file, and is shared here for your reference. The details are as follows:
# coding=utf-8
#-----------------------------------
#   Program:  Netease crawler
#   Author:   ewang
#   Date:
#   Language: Python 2.7
#   Function: Get the text information on the Netease page and save it to a TXT file.
#-------------------------------------
import string
import urllib2
import re
import os

class WangYi_Spider:
    # Declare the related attributes
    def __init__(self):
        # URL of the page to crawl
        self.wangyiUrl = "http://www.163.com/"
        # Used to save the text information found on the page
        self.pageinfor = []
        print u'The Netease crawler has started, crawling...'

    # Initialize: load the page, transcode it and process it
    def wangyi(self):
        # Read the raw page and decode it from gbk
        Page = urllib2.urlopen(self.wangyiUrl).read().decode('gbk')
        # Get the page title
        title = self.find_title(Page)
        print u'Page name: ' + title
        # Get the text information on the page and save it
        self.save_infor(title)

    # Find the page title
    def find_title(self, page):
        # Match <title>xxxx</title>
        myTitle = re.search(r'<title>(.*?)</title>', page, re.S)
        # Initialize the title name
        title = u'no title'
        # Assign the matched text to title
        if myTitle:
            # (.*?) is a group; group numbering starts from 1
            title = myTitle.group(1)
        else:
            print u'Crawler report: unable to load the page title...'
        return title

    # Save the page information
    def save_infor(self, title):
        # Load the page text information into the array
        self.get_infor()
        # Create and open the local file
        f = open(title + '.txt', 'w+')
        # Write the collected page information to the file
        f.writelines(self.pageinfor)
        # Close the open file
        f.close()
        print u'Crawler report: file ' + title + '.txt' + u' has been saved to: ' + os.getcwd()
        print u'Press any key to exit...'
        raw_input()

    # Get the page source and store it in the array
    def get_infor(self):
        # Get the page source
        page = urllib2.urlopen(self.wangyiUrl).read()
        # Decode the page content from gbk, then collect all the text information on the page
        self.deal_infor(page.decode('gbk'))

    # Extract the required text information from the page source
    def deal_infor(self, page):
        # Get the text XXX from <em ...>XXX</em>
        emTagItems = re.findall("<em.*?>(\W+?)</em>", page, re.S)
        # Get the text XXXX from <span>XXXX</span>
        spanTagItems = re.findall("<span>(\W+?)</span>", page, re.S)
        # Get the text XXXX from <a ...>XXXX</a>
        aTagItems = re.findall("<a.*?>(\W+?)</a>", page, re.S)

        # Add the text found in em tags to the array pageinfor
        for emItem in emTagItems:
            # Encode the text with gbk
            self.pageinfor.append(emItem.encode('gbk') + '\n')

        # Add the text found in span tags to the array pageinfor
        for spanItem in spanTagItems:
            # Encode the text with gbk
            self.pageinfor.append(spanItem.encode('gbk') + '\n')

        # Add the text found in a tags to the array pageinfor
        for aItem in aTagItems:
            # Encode the text with gbk
            self.pageinfor.append(aItem.encode('gbk') + '\n')

#------------ Program entry point ------------------
print u"""#-----------------------------------
#   Program:  Netease crawler
#   Author:   ewang
#   Date:
#   Language: Python 2.7
#   Function: Get the text information on the Netease page and save it to a TXT file.
#----------------------------------------------"""
wangyiSpider = WangYi_Spider()
wangyiSpider.wangyi()
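The script above targets Python 2.7 (urllib2, raw_input, gbk byte strings). For reference, below is a minimal sketch of the same approach on Python 3, assuming urllib.request in place of urllib2 and a UTF-8 output file; the URL and the em/span/a patterns are carried over from the original, but the capture group is loosened to (.+?) because \W no longer matches Chinese characters under Python 3's Unicode-aware regexes, and the function name fetch_netease_text is only illustrative.

# Minimal Python 3 sketch of the same crawl-and-save idea (an adaptation, not the original script)
import os
import re
import urllib.request

def fetch_netease_text(url="http://www.163.com/"):
    # Download the page and decode it; the original assumes gbk encoding
    raw = urllib.request.urlopen(url).read()
    page = raw.decode('gbk', errors='replace')

    # Page title from <title>...</title>
    match = re.search(r'<title>(.*?)</title>', page, re.S)
    title = match.group(1) if match else 'no title'

    # Collect text wrapped in <em>, <span> and <a> tags, as the original does
    items = []
    for pattern in (r'<em.*?>(.+?)</em>',
                    r'<span>(.+?)</span>',
                    r'<a.*?>(.+?)</a>'):
        items.extend(re.findall(pattern, page, re.S))

    # Write one item per line to <title>.txt in the current directory
    with open(title + '.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(items))
    print('Saved', title + '.txt', 'to', os.getcwd())

if __name__ == '__main__':
    fetch_netease_text()

Note that regex-based extraction of HTML is fragile; for anything beyond a learning exercise, an HTML parser such as html.parser or BeautifulSoup is usually the safer choice.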