import requests
import json
from retrying import retry
from lxml import etree
from queue import Queue
import threading
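
# Pipeline overview: get_url_list() fills url_queue; parse_url() workers turn
# URLs into parsed HTML on html_queue; get_content_list() workers extract items
# onto content_list_queue; save_content_list() writes them to disk.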
class Qiushi:
    def __init__(self):
        # define one queue for each stage of the pipeline
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_list_queue = Queue()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
        }
    def get_url_list(self):
        url_list = ['https://www.qiushibaike.com/8hr/page/{}/'.format(i) for i in range(1, 14)]
        for url in url_list:
            # put each URL into the queue with put()
            self.url_queue.put(url)
    # retry the request up to three times before giving up
    @retry(stop_max_attempt_number=3)
    def _parse_url(self, url):
        response = requests.get(url, headers=self.headers, timeout=3)
        assert response.status_code == 200
        return etree.HTML(response.content)
    def parse_url(self):
        # get() takes a single URL from the queue, so wrap it in a while True
        # loop to keep consuming; when the queue is empty, get() blocks until
        # a URL becomes available.
        # get() alone does not decrement the queue's unfinished-task count,
        # so task_done() is called below after each URL has been handled.
        while True:
            url = self.url_queue.get()
            print(url)
            try:
                html = self._parse_url(url)
            except Exception:
                html = None
            # pass the parsed HTML on to the next stage
            self.html_queue.put(html)
            self.url_queue.task_done()
    def get_content_list(self):
        # same consume / process / task_done pattern as parse_url() above
        while True:
            html = self.html_queue.get()
            if html is not None:
                div_list = html.xpath('//div[@id="content-left"]/div')
                content_list = []
                for div in div_list:
                    item = {}
                    item['name'] = div.xpath('.//h2/text()')[0].replace("\n", "") if len(div.xpath('.//h2/text()')) > 0 else None
                    item['content'] = div.xpath('.//div[@class="content"]/span/text()')[0].replace("\n", "") if len(div.xpath('.//div[@class="content"]/span/text()')) > 0 else None
                    item['comment'] = div.xpath('.//i[@class="number"]/text()')[1] if len(div.xpath('.//i[@class="number"]/text()')) > 1 else None
                    item['img'] = div.xpath('.//img/@src') if len(div.xpath('.//img/@src')) > 0 else None
                    content_list.append(item)
                self.content_list_queue.put(content_list)
            self.html_queue.task_done()
    def save_content_list(self):
        while True:
            content_list = self.content_list_queue.get()
            with open("qiubai.json", "a", encoding="utf-8") as f:
                for content in content_list:
                    json.dump(content, f, ensure_ascii=False, indent=2)
                    f.write(',\n')
            self.content_list_queue.task_done()
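    # Note: appending comma-separated objects produces a file that is not one
    # valid JSON document; writing one json.dumps(content) per line (JSON Lines)
    # would be easier to parse back.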
    def run(self):
        thread_list = []
        # one thread to generate the URLs
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        # sending requests is I/O-bound and slow, so use several threads
        for i in range(5):
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)
        # extracting data is also time-consuming, so use several threads here too
        for i in range(3):
            t_get_content_list = threading.Thread(target=self.get_content_list)
            thread_list.append(t_get_content_list)
        # a single thread must do the saving, so file writes do not interleave
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.daemon = True  # daemon threads are killed when the main thread exits
            t.start()
        # join() returns once a queue's unfinished-task count drops to zero, so
        # the main thread (and with it every daemon worker) ends only after all
        # queued work is done; otherwise it blocks here.
        for q in [self.content_list_queue, self.html_queue, self.url_queue]:
            q.join()
if __name__ == '__main__':
    qiubai = Qiushi()
    qiubai.run()
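
The shutdown logic relies on the contract between Queue.task_done() and Queue.join(): join() blocks until task_done() has been called once for every item that was put(). Here is a minimal sketch of that pattern in isolation (the worker and the items are illustrative stand-ins, not part of the scraper above):

import threading
from queue import Queue

q = Queue()

def worker():
    # each get() must be matched by exactly one task_done()
    while True:
        item = q.get()
        print("processed", item)  # stand-in for real work
        q.task_done()

t = threading.Thread(target=worker, daemon=True)
t.start()

for i in range(5):
    q.put(i)

# returns once task_done() has been called for all five items;
# the daemon worker is then killed when the main thread exits
q.join()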