Write tasks.py
The code is as follows:
from celery import Celery
from tornado.httpclient import HTTPClient, HTTPError

app = Celery('tasks')
app.config_from_object('celeryconfig')


@app.task
def get_html(url):
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        return response.body
    except HTTPError:
        # fetch failures (timeouts, non-2xx responses) yield no result
        return None
    finally:
        # close() must run in a finally block; placed after the
        # return statements it was unreachable
        http_client.close()
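With RabbitMQ running locally and a worker started via celery -A tasks worker --loglevel=info, a quick smoke test of the task might look like this (a minimal sketch; the file name check_task.py is hypothetical):

# check_task.py (hypothetical) -- verify the task round-trips through the broker
from tasks import get_html

result = get_html.delay('http://www.jb51.net/')   # queue the fetch on a worker
body = result.get(timeout=10)                     # block until the worker replies
print(body[:200] if body else 'fetch failed')     # get_html returns None on HTTPError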
Write celeryconfig.py
The code is as follows:
CELERY_IMPORTS = ('tasks',)
BROKER_URL = 'amqp://guest@localhost:5672//'
CELERY_RESULT_BACKEND = 'amqp://'
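As an aside, the same settings could be applied inline with app.conf.update instead of a separate config module; a minimal equivalent sketch:

# equivalent inline configuration in tasks.py, replacing config_from_object
from celery import Celery

app = Celery('tasks')
app.conf.update(
    CELERY_IMPORTS=('tasks',),
    BROKER_URL='amqp://guest@localhost:5672//',
    CELERY_RESULT_BACKEND='amqp://',
)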
Write spider.py
The code is as follows:
from tasks import get_html
from queue import Queue
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import threading


class Spider(object):
    def __init__(self):
        self.visited = {}
        self.queue = Queue()

    def process_html(self, html):
        pass
        # print(html)

    def _add_links_to_queue(self, url_base, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            try:
                url = link['href']
            except KeyError:
                # <a> tags without an href attribute are skipped
                pass
            else:
                url_com = urlparse(url)
                if not url_com.netloc:
                    # relative link: resolve it against the current page
                    self.queue.put(urljoin(url_base, url))
                else:
                    self.queue.put(url_com.geturl())

    def start(self, url):
        self.queue.put(url)
        for i in range(20):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()
        self.queue.join()

    def _worker(self):
        while 1:
            url = self.queue.get()
            if url in self.visited:
                # already crawled; still mark the item done or join() never returns
                self.queue.task_done()
                continue
            result = get_html.delay(url)
            try:
                html = result.get(timeout=5)
            except Exception as e:
                print(url)
                print(e)
                self.queue.task_done()
                continue
            if html:
                self.process_html(html)
                self._add_links_to_queue(url, html)
            self.visited[url] = True
            self.queue.task_done()


s = Spider()
s.start("http://www.jb51.net/")
Because real-world HTML contains many special cases, the program still needs further refinement.
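For example, hrefs such as mailto: links, javascript: pseudo-URLs, and bare fragments like #top all pass through _add_links_to_queue unchanged; one possible filter (a sketch, not part of the original code) is:

from urllib.parse import urlparse

def is_crawlable(url):
    """Keep only http(s) links; drop mailto:, javascript:, and pure fragments."""
    parts = urlparse(url)
    return parts.scheme in ('', 'http', 'https') and bool(parts.netloc or parts.path)

# e.g. is_crawlable('mailto:a@b.com') -> False, is_crawlable('/about') -> True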