This script uses re, urllib, and threading to crawl the content of a Tianya forum post with multiple threads. Set url to the first page of the Tianya post to crawl, and set file_name to the name of the file the downloaded text is saved to.
The code is as follows:
# coding: utf-8
# Standard-library modules used by the crawler below.
import os
import re
import threading
import time
import urllib
class Down_tianya(threading.Thread):
    """Worker thread that downloads one page and stores the text found on it.

    Each instance fetches a single URL, runs ``TEXT_PATTERN`` over the
    response and stores the resulting list of strings in a dictionary supplied
    by the caller, keyed by the page number.
    """

    # Compiled once for all threads instead of once per download.
    # NOTE(review): this literal is reproduced exactly as it appears in the
    # listing. It looks damaged by extraction (stray spaces, and presumably
    # the markup that anchored the two groups is gone) -- restore it from the
    # upstream script before relying on the matches.
    TEXT_PATTERN = re.compile(r' time: (. *?). *? . *?\s* (. *?) ', re.DOTALL)

    def __init__(self, url, num, dt):
        """Remember what to download and where to put the result.

        Args:
            url: Address of the page this thread downloads.
            num: Page number; used as the key into ``dt``.
            dt: Dictionary that receives this page's text under ``dt[num]``.
                The caller passes the same dictionary to every thread.
        """
        threading.Thread.__init__(self)
        self.url = url
        self.num = num
        self.txt_dict = dt

    def run(self):
        """Thread entry point: announce the download, then perform it."""
        print('downling from %s' % self.url)
        self.down_text()

    def down_text(self):
        """Fetch ``self.url`` and store the extracted text for this page.

        Every regex match yields a tuple with one string per capture group;
        the strings of one match are joined with four CRLF pairs, and the list
        of joined strings is stored under ``self.txt_dict[self.num]``.
        """
        # NOTE: urllib.urlopen is the Python 2 API; Python 3 moved it to
        # urllib.request.urlopen, whose read() returns bytes.
        html_content = urllib.urlopen(self.url).read()
        matches = self.TEXT_PATTERN.findall(html_content)
        self.txt_dict[self.num] = ['\r\n\r\n\r\n\r\n'.join(item) for item in matches]
def page(url):
    """Return the total number of pages, read from the first page at ``url``.

    The page is downloaded and searched for a run of digits followed by the
    "next page" text; those digits are taken as the page count.

    Args:
        url: Address of the first page of the post.

    Returns:
        The page count as an ``int``, or ``None`` when the pattern does not
        match or matches without capturing any digits.
    """
    # NOTE: urllib.urlopen is the Python 2 API; Python 3 moved it to
    # urllib.request.urlopen, whose read() returns bytes.
    html_page = urllib.urlopen(url).read()
    # NOTE(review): literal reproduced exactly as it appears in the listing;
    # it looks damaged by extraction (presumably the surrounding markup and
    # the original link text are gone) -- restore it from the upstream script.
    page_pattern = re.compile(r' (\d*) \s* next page ')
    page_result = page_pattern.search(html_page)
    if not page_result:
        return None
    digits = page_result.group(1)
    # (\d*) may capture an empty string, which int() would reject.
    return int(digits) if digits else None
def write_text(dict, fn):
    """Write the downloaded text of every page to the file ``fn``.

    Args:
        dict: Mapping of page number to the list of text entries of that
            page. Pages are written in ascending key order; a page that is
            missing from the mapping is skipped instead of raising KeyError.
        fn: Path of the output file. It is created or truncated.
    """
    # The context manager closes the file even if a write fails.
    with open(fn, 'w+') as tx_file:
        for page_num in sorted(dict):
            for tx in dict[page_num]:
                # NOTE(review): the three search strings were lost when this
                # listing was extracted (the first two show up as raw line
                # breaks, the third as nothing). '<br>', '<br />' and '&nbsp;'
                # are the presumed originals -- confirm against the upstream
                # script.
                tx = (tx.replace('<br>', '\r\n')
                        .replace('<br />', '\r\n')
                        .replace('&nbsp;', ''))
                tx_file.write(tx.strip() + '\r\n')
def main():
    """Download every page of the configured post into a single text file.

    Reads the page count from the first page, starts one ``Down_tianya``
    thread per page, waits for all of them, then writes the collected text to
    ``file_name`` in the current working directory.
    """
    url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
    file_name = 'Abc.txt'
    my_page = page(url)
    if not my_page:
        # No page count found: fall back to downloading the given page only.
        # NOTE(review): assumes a post without a "next page" link has exactly
        # one page -- confirm.
        my_page = 1
    my_dict = {}
    print('Page num is: %s' % my_page)
    threads = []
    # Build one URL per page and download each page in its own thread.
    # url[:-7] drops the trailing '1.shtml' so the page number can be swapped.
    for num in range(1, my_page + 1):
        myurl = '%s%s.shtml' % (url[:-7], num)
        downlist = Down_tianya(myurl, num, my_dict)
        downlist.start()
        threads.append(downlist)
    # Wait until every download has finished before writing the file.
    for t in threads:
        t.join()
    write_text(my_dict, file_name)
    print('all download finished. Save file at directory: %s' % os.getcwd())
# Run the crawler only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
down_tianya.py
The code is as follows:
# coding: utf-8
# Standard-library modules used by the crawler below.
import os
import re
import threading
import urllib
class Down_tianya(threading.Thread):
    """Worker thread that downloads one page and stores the text found on it.

    Each instance fetches a single URL, runs ``TEXT_PATTERN`` over the
    response and stores the resulting list of strings in a dictionary supplied
    by the caller, keyed by the page number.
    """

    # Compiled once for all threads instead of once per download.
    # NOTE(review): this literal is reproduced exactly as it appears in the
    # listing. It looks damaged by extraction (stray spaces, and presumably
    # the markup that anchored the two groups is gone) -- restore it from the
    # upstream script before relying on the matches.
    TEXT_PATTERN = re.compile(r' Time: (. *?). *? . *?\s* (. *?) ', re.DOTALL)

    def __init__(self, url, num, dt):
        """Remember what to download and where to put the result.

        Args:
            url: Address of the page this thread downloads.
            num: Page number; used as the key into ``dt``.
            dt: Dictionary that receives this page's text under ``dt[num]``.
                The caller passes the same dictionary to every thread.
        """
        threading.Thread.__init__(self)
        self.url = url
        self.num = num
        self.txt_dict = dt

    def run(self):
        """Thread entry point: announce the download, then perform it."""
        print('downling from %s' % self.url)
        self.down_text()

    def down_text(self):
        """Fetch ``self.url`` and store the extracted text for this page.

        Every regex match yields a tuple with one string per capture group;
        the strings of one match are joined with four CRLF pairs, and the list
        of joined strings is stored under ``self.txt_dict[self.num]``.
        """
        # NOTE: urllib.urlopen is the Python 2 API; Python 3 moved it to
        # urllib.request.urlopen, whose read() returns bytes.
        html_content = urllib.urlopen(self.url).read()
        matches = self.TEXT_PATTERN.findall(html_content)
        self.txt_dict[self.num] = ['\r\n\r\n\r\n\r\n'.join(item) for item in matches]
def page(url):
    """Return the total number of pages, read from the first page at ``url``.

    The page is downloaded and searched for a run of digits followed by the
    "next page" text; those digits are taken as the page count.

    Args:
        url: Address of the first page of the post.

    Returns:
        The page count as an ``int``, or ``None`` when the pattern does not
        match or matches without capturing any digits.
    """
    # NOTE: urllib.urlopen is the Python 2 API; Python 3 moved it to
    # urllib.request.urlopen, whose read() returns bytes.
    html_page = urllib.urlopen(url).read()
    # NOTE(review): literal reproduced exactly as it appears in the listing;
    # it looks damaged by extraction (presumably the surrounding markup and
    # the original link text are gone) -- restore it from the upstream script.
    page_pattern = re.compile(r' (\d*) \s* next page ')
    page_result = page_pattern.search(html_page)
    if not page_result:
        return None
    digits = page_result.group(1)
    # (\d*) may capture an empty string, which int() would reject.
    return int(digits) if digits else None
def write_text(dict, fn):
    """Write the downloaded text of every page to the file ``fn``.

    Args:
        dict: Mapping of page number to the list of text entries of that
            page. Pages are written in ascending key order; a page that is
            missing from the mapping is skipped instead of raising KeyError.
        fn: Path of the output file. It is created or truncated.
    """
    # The context manager closes the file even if a write fails.
    with open(fn, 'w+') as tx_file:
        for page_num in sorted(dict):
            for tx in dict[page_num]:
                # NOTE(review): the three search strings were lost when this
                # listing was extracted (the first two show up as raw line
                # breaks, the third as nothing). '<br>', '<br />' and '&nbsp;'
                # are the presumed originals -- confirm against the upstream
                # script.
                tx = (tx.replace('<br>', '\r\n')
                        .replace('<br />', '\r\n')
                        .replace('&nbsp;', ''))
                tx_file.write(tx.strip() + '\r\n')
def main():
    """Download every page of the configured post into a single text file.

    Reads the page count from the first page, starts one ``Down_tianya``
    thread per page, waits for all of them, then writes the collected text to
    ``file_name`` in the current working directory.
    """
    url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
    file_name = 'Abc.txt'
    my_page = page(url)
    if not my_page:
        # No page count found: fall back to downloading the given page only.
        # NOTE(review): assumes a post without a "next page" link has exactly
        # one page -- confirm.
        my_page = 1
    my_dict = {}
    print('Page num is: %s' % my_page)
    threads = []
    # Build one URL per page and download each page in its own thread.
    # url[:-7] drops the trailing '1.shtml' so the page number can be swapped.
    for num in range(1, my_page + 1):
        myurl = '%s%s.shtml' % (url[:-7], num)
        downlist = Down_tianya(myurl, num, my_dict)
        downlist.start()
        threads.append(downlist)
    # Wait until every download has finished before writing the file.
    for t in threads:
        t.join()
    write_text(my_dict, file_name)
    print('all download finished. Save file at directory: %s' % os.getcwd())
# Run the crawler only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()