This script uses re, urllib, and threading to capture the content of a Tianya forum post. Set url to the first page of the Tianya post to be crawled, and set file_name to the name of the file the result is saved to.
The code is as follows:
# coding: utf-8
import urllib
import re
import threading
import os, time
class Down_Tianya(threading.Thread):
    """Download one page of the post in its own thread."""

    def __init__(self, url, num, dt):
        threading.Thread.__init__(self)
        self.url = url
        self.num = num
        self.txt_dict = dt

    def run(self):
        print 'downloading from %s' % self.url
        self.down_text()

    def down_text(self):
        """Capture the content of the page at self.url and store it in the shared dictionary under the page number."""
        html_content = urllib.urlopen(self.url).read()
        # The label text below is shown translated; the live Tianya markup uses
        # the Chinese labels (时间 for "time", 楼主 for "main poster").
        text_pattern = re.compile('<span>time:(.*?)</span>.*?<!--<div class="host-ico">main poster</div>-->.*?<div class="bbs-content.*?>\s*(.*?)</div>', re.DOTALL)
        text = text_pattern.findall(html_content)
        text_join = ['\r\n\r\n'.join(item) for item in text]
        self.txt_dict[self.num] = text_join
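Each match yields a (time, content) tuple, one per post on the page. To see the shape of what down_text extracts, here is a minimal sketch run against a hand-written fragment; the fragment and the English labels are illustrative stand-ins for the real Tianya markup:

# coding: utf-8
import re

sample = '''<span>time:2014-01-01 10:00:00</span>
<!--<div class="host-ico">main poster</div>-->
<div class="bbs-content clearfix">
 Hello from the original poster.</div>'''

pattern = re.compile('<span>time:(.*?)</span>.*?<!--<div class="host-ico">main poster</div>-->.*?<div class="bbs-content.*?>\s*(.*?)</div>', re.DOTALL)
print pattern.findall(sample)
# [('2014-01-01 10:00:00', 'Hello from the original poster.')]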
def page(url):
    """Capture the total number of pages from the first-page address."""
    html_page = urllib.urlopen(url).read()
    # The last page number sits in the link just before the "next page" anchor
    # (shown translated; 下一页 on the live page).
    page_pattern = re.compile(r'<a href="\S*?">(\d*)</a>\s*<a href="\S*?" class="\S*?">next page</a>')
    page_result = page_pattern.search(html_page)
    if page_result:
        page_num = int(page_result.group(1))
        return page_num
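The pattern keys off the page-number link that immediately precedes the "next page" anchor in the pagination bar. A rough sketch against a made-up navigation fragment (the hrefs and class name are placeholders, not the real Tianya attributes):

import re

nav = '<a href="/post-16-996521-99.shtml">99</a> <a href="/post-16-996521-2.shtml" class="js-next">next page</a>'
page_pattern = re.compile(r'<a href="\S*?">(\d*)</a>\s*<a href="\S*?" class="\S*?">next page</a>')
m = page_pattern.search(nav)
if m:
    print m.group(1)  # 99, the highest page number listed before "next page"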
def write_text(dict, fn):
    """Write the dictionary out by key (page number); each value is the list of posts on that page."""
    tx_file = open(fn, 'w+')
    pn = len(dict)
    for i in range(1, pn + 1):
        tx_list = dict[i]
        for tx in tx_list:
            tx = tx.replace('<br>', '\r\n').replace('<br/>', '\r\n')
            tx_file.write(tx.strip() + '\r\n' * 4)
    tx_file.close()
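write_text expects a dictionary mapping page numbers to lists of post strings, which is exactly what the download threads fill in. A short usage sketch with throwaway data (assuming write_text as defined above; demo.txt is an arbitrary name):

my_dict = {
    1: ['time:2014-01-01\r\n\r\nfirst post<br>second line'],
    2: ['time:2014-01-02\r\n\r\na reply'],
}
write_text(my_dict, 'demo.txt')
# demo.txt now holds the posts in page order, with <br> tags turned
# into line breaks and entries separated by blank lines.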
def main():
    url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
    file_name = 'abc.txt'
    my_page = page(url)
    my_dict = {}
    print 'page num is: %s' % my_page
    threads = []
    # Build the URL of each page for multi-threaded download.
    for num in range(1, my_page + 1):
        myurl = '%s%s.shtml' % (url[:-7], num)
        downlist = Down_Tianya(myurl, num, my_dict)
        downlist.start()
        threads.append(downlist)
    # Make sure every download has finished before writing.
    for t in threads:
        t.join()
    write_text(my_dict, file_name)
    print 'all downloads finished. Saved file in directory: %s' % os.getcwd()

if __name__ == '__main__':
    main()
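The shared dictionary needs no lock here: each Down_Tianya thread writes only its own key, and main() joins every thread before write_text reads the result. The same pattern in isolation, as a minimal sketch:

import threading

results = {}

class Worker(threading.Thread):
    def __init__(self, num):
        threading.Thread.__init__(self)
        self.num = num

    def run(self):
        results[self.num] = self.num * self.num  # each thread owns one key

threads = [Worker(n) for n in range(1, 4)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for all writes before reading
print results  # {1: 1, 2: 4, 3: 9}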
The complete script, Down_tianya.py, is as follows:
# coding: utf-8
import urllib
import re
import threading
import os

class Down_Tianya(threading.Thread):
    """Download one page of the post in its own thread."""

    def __init__(self, url, num, dt):
        threading.Thread.__init__(self)
        self.url = url
        self.num = num
        self.txt_dict = dt

    def run(self):
        print 'downloading from %s' % self.url
        self.down_text()

    def down_text(self):
        """Capture the content of the page at self.url and store it in the shared dictionary under the page number."""
        html_content = urllib.urlopen(self.url).read()
        # This version additionally anchors each match on the enclosing "atl-item"
        # block so the non-greedy matches cannot span posts (labels shown
        # translated, as above).
        text_pattern = re.compile('<div class="atl-item".*?<span>time:(.*?)</span>.*?<!--<div class="host-ico">main poster</div>-->.*?<div class="bbs-content.*?>\s*(.*?)</div>', re.DOTALL)
        text = text_pattern.findall(html_content)
        text_join = ['\r\n\r\n'.join(item) for item in text]
        self.txt_dict[self.num] = text_join

def page(url):
    """Capture the total number of pages from the first-page address."""
    html_page = urllib.urlopen(url).read()
    page_pattern = re.compile(r'<a href="\S*?">(\d*)</a>\s*<a href="\S*?" class="\S*?">next page</a>')
    page_result = page_pattern.search(html_page)
    if page_result:
        page_num = int(page_result.group(1))
        return page_num

def write_text(dict, fn):
    """Write the dictionary out by key (page number); each value is the list of posts on that page."""
    tx_file = open(fn, 'w+')
    pn = len(dict)
    for i in range(1, pn + 1):
        tx_list = dict[i]
        for tx in tx_list:
            tx = tx.replace('<br>', '\r\n').replace('<br/>', '\r\n')
            tx_file.write(tx.strip() + '\r\n' * 4)
    tx_file.close()

def main():
    url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
    file_name = 'abc.txt'
    my_page = page(url)
    my_dict = {}
    print 'page num is: %s' % my_page
    threads = []
    # Build the URL of each page for multi-threaded download.
    for num in range(1, my_page + 1):
        myurl = '%s%s.shtml' % (url[:-7], num)
        downlist = Down_Tianya(myurl, num, my_dict)
        downlist.start()
        threads.append(downlist)
    # Make sure every download has finished before writing.
    for t in threads:
        t.join()
    write_text(my_dict, file_name)
    print 'all downloads finished. Saved file in directory: %s' % os.getcwd()

if __name__ == '__main__':
    main()
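The script targets Python 2 (print statements, urllib.urlopen). Under Python 3 the same fetch lives in urllib.request, and the response bytes must be decoded before the regexes can run on them; a rough sketch of the equivalent step:

# Python 3 equivalent of the fetch used above (sketch)
from urllib.request import urlopen

url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
# 'utf-8' is an assumption; match the charset the page actually declares.
html_content = urlopen(url).read().decode('utf-8', errors='replace')
print('fetched %d characters from %s' % (len(html_content), url))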