Multithreading:
"""Multithreading demo: worker threads drain a shared queue and print each item.

NOTE(review): reconstructed from a garbled one-line transcription; queue
capacity and some identifiers were unreadable in the original — confirm
against the author's source.
"""
import re
import threading
from multiprocessing import Queue
from time import sleep

from bs4 import BeautifulSoup
from requests import get


class MyThread(threading.Thread):
    """Worker thread: takes the shared lock/queue and processes one item."""

    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)


def process(qlock, queue):
    """Pop one item from the queue under the mutex and print it."""
    qlock.acquire()  # mutex
    try:
        data = queue.get()  # read from the queue
        print(data)
    finally:
        qlock.release()  # always release the lock
    sleep(1)


# Build the work queue and the lock that serializes access to it.
work_queue = Queue()  # capacity was unreadable in the original — TODO confirm
qlock = threading.Lock()

url = 'https://www.pixiv.net/ranking.php?mode=daily'
r = get(url, timeout=1)
html = r.text
soup = BeautifulSoup(html, 'lxml')
img_tags = soup.find_all('img')

# Compile once, outside the loop (the original recompiled per iteration).
pattern = re.compile(r'data-src="(.+?)"')
links = []
for tag in img_tags:
    link = pattern.findall(str(tag))
    work_queue.put(link)  # write to the queue
    links.append(link)

# One worker thread per queued item.
threads = []
for _ in links:
    thread = MyThread(qlock, work_queue)
    thread.daemon = True
    thread.start()
    threads.append(thread)

# Busy-wait until the queue is drained (kept from the original design).
while not work_queue.empty():
    pass

# Wait for every thread to finish.
for t in threads:
    t.join()
Multi-process:
1. Create a process pool using the Pool module:
"""Multiprocessing demo 1: fan links out to a process pool.

Fix vs. the original: ``apply_async(..., args=(u))`` passed ``u`` itself,
not a 1-tuple — ``args`` must be a tuple, so it is now ``(u,)``.
"""
import os
import re
from multiprocessing import Pool

from bs4 import BeautifulSoup
from requests import get


def run_process(url):
    """Pool worker: just print the link it was handed."""
    print(url)


if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    img_tags = soup.find_all('img')

    # Compile the pattern once; extract every data-src ending in .jpg.
    pattern = re.compile(r'data-src="(.+?\.jpg)"')
    links = [pattern.findall(str(tag)) for tag in img_tags]

    pool = Pool(os.cpu_count())  # one worker per CPU core
    for link in links:
        # args must be a tuple — note the trailing comma.
        pool.apply_async(run_process, args=(link,))
    pool.close()
    pool.join()
2. Using the Process class with a Queue for inter-process communication (note: in my version, the queue is filled before the workers start, so the queue writes themselves are not parallel):
"""Multiprocessing demo 2: Process subclass + Queue for IPC.

Fixes vs. the original: every started process is kept and joined (the
original rebound one variable and joined only the last process); the
queue capacity was unreadable ("Queue(+)") — 100 is assumed, TODO confirm.
"""
import re
from multiprocessing import Process, Queue

from bs4 import BeautifulSoup
from requests import get


class MyProcess(Process):
    """Worker process: pulls one item from the shared queue and prints it."""

    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)


def run_process(queue):
    """Take one item off the queue and print it."""
    data = queue.get()
    print(data)


if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    img_tags = soup.find_all('img')

    queue = Queue(100)  # capacity garbled in the original — TODO confirm
    pattern = re.compile(r'data-src="(.+?\.jpg)"')
    links = []
    for tag in img_tags:
        link = pattern.findall(str(tag))
        queue.put(link)  # written here, before any worker starts —
        links.append(link)  # so the queue writes are not parallel

    # One process per queued item; keep them all so each can be joined.
    processes = []
    for _ in links:
        proc = MyProcess(queue)
        proc.start()
        processes.append(proc)

    # Busy-wait until the queue is drained (kept from the original design).
    while not queue.empty():
        pass

    for proc in processes:
        proc.join()
The second version is significantly slower than the first, and I don't know why...
But the examples above are CPU-bound; next, let's test a small I/O-bound web crawler to see the effect:
1. Multithreading:
"""I/O-bound multithreading demo: worker threads download images from a queue.

NOTE(review): reconstructed from a garbled one-line transcription; the
destination path is Windows-specific and kept verbatim from the original.
"""
import re
import threading
from multiprocessing import Queue
from time import sleep

from bs4 import BeautifulSoup
from requests import get


class MyThread(threading.Thread):
    """Worker thread: takes the shared lock/queue and downloads one image."""

    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)


def process(qlock, queue):
    """Pop one link off the queue under the mutex and save the image to disk."""
    qlock.acquire()  # mutex
    try:
        url = queue.get()[0]  # each queue item is a 1-element findall list
        img = get(url, timeout=1).content
        name = url.split('/')[-1]
        imgid = name[:8]  # first 8 chars of the file name used as the id
        with open('C:/users/adimin/desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
            fp.write(img)
        print('Download: ' + url)
    finally:
        qlock.release()
    # sleep(1)


# Build the work queue and the lock that serializes access to it.
work_queue = Queue()
qlock = threading.Lock()

url = 'https://www.pixiv.net/ranking.php?mode=daily'
html = get(url, timeout=1).text
soup = BeautifulSoup(html, 'lxml')
img_tags = soup.find_all('img')

# Compile once, outside the loop (the original recompiled per iteration).
pattern = re.compile(r'data-src="(.+?\.jpg)"')
links = []
for tag in img_tags:
    link = pattern.findall(str(tag))
    work_queue.put(link)  # write to the queue
    links.append(link)

# One worker thread per queued item.
threads = []
for _ in links:
    thread = MyThread(qlock, work_queue)
    thread.start()
    threads.append(thread)

# Busy-wait until the queue is drained (kept from the original design).
while not work_queue.empty():
    pass

# Wait for every thread to finish.
for t in threads:
    t.join()
2. Multi-process:
"""I/O-bound multiprocessing demo: worker processes download images from a queue.

Fixes vs. the original: every started process is kept and joined (the
original rebound one variable and joined only the last process); the
queue capacity was unreadable ("Queue(+)") — 100 is assumed, TODO confirm.
"""
import re
from multiprocessing import Process, Queue

from bs4 import BeautifulSoup
from requests import get


class MyProcess(Process):
    """Worker process: pulls one link from the shared queue and downloads it."""

    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)


def run_process(queue):
    """Take one link off the queue and save the image to disk."""
    url = queue.get()[0]  # each queue item is a 1-element findall list
    img = get(url, timeout=1).content
    name = url.split('/')[-1]
    imgid = name[:8]  # first 8 chars of the file name used as the id
    with open('c:/users/adimin/desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('Download: ' + url)


if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    img_tags = soup.find_all('img')

    queue = Queue(100)  # capacity garbled in the original — TODO confirm
    pattern = re.compile(r'data-src="(.+?\.jpg)"')
    links = []
    for tag in img_tags:
        link = pattern.findall(str(tag))
        queue.put(link)
        links.append(link)

    # One process per queued item; keep them all so each can be joined.
    processes = []
    for _ in links:
        proc = MyProcess(queue)
        proc.start()
        processes.append(proc)

    # Busy-wait until the queue is drained (kept from the original design).
    while not queue.empty():
        pass

    for proc in processes:
        proc.join()
In the end, the running times felt about the same — I couldn't see a meaningful gap.
Python — multithreading / multiprocessing