Write tasks.py
The code is as follows:
from celery import Celery
from tornado.httpclient import HTTPClient, HTTPError

app = Celery('tasks')
app.config_from_object('celeryconfig')

@app.task
def get_html(url):
    # Fetch the page body for a URL; return None if the request fails.
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        return response.body
    except HTTPError:
        return None
    finally:
        # Close the client whether the fetch succeeded or not.
        http_client.close()
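Before wiring the task into the spider, it can be useful to check the setup from a Python shell. The sketch below assumes RabbitMQ is running locally and a Celery worker has already been started (for example with: celery -A tasks worker --loglevel=info); the URL is only a placeholder.

# Quick sanity check (assumes RabbitMQ and a Celery worker are already running):
from tasks import get_html

result = get_html.delay('http://www.example.com/')  # send the task to the broker
print(result.get(timeout=10))                       # block until the worker returns the page body (or None)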
Write celeryconfig.py
The code is as follows:
CELERY_IMPORTS = ('tasks',)
BROKER_URL = 'amqp://guest@localhost:5672//'
CELERY_RESULT_BACKEND = 'amqp://'
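As a side note, the same settings can also be applied directly on the app object instead of a separate config file. The following is only a sketch of that alternative, reusing the broker and backend URLs from above.

from celery import Celery

app = Celery('tasks')
# Equivalent to loading celeryconfig.py, but set inline on the app:
app.conf.update(
    BROKER_URL='amqp://guest@localhost:5672//',
    CELERY_RESULT_BACKEND='amqp://',
)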
Write spider.py
The code is as follows:
import threading
from queue import Queue
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup

from tasks import get_html


class Spider(object):
    def __init__(self):
        self.visited = {}
        self.queue = Queue()

    def process_html(self, html):
        pass
        # print(html)

    def _add_links_to_queue(self, url_base, html):
        # Extract every <a href> on the page and queue it as an absolute URL.
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            try:
                url = link['href']
            except KeyError:
                pass
            else:
                url_com = urlparse(url)
                if not url_com.netloc:
                    # Relative link: resolve it against the page it came from.
                    self.queue.put(urljoin(url_base, url))
                else:
                    self.queue.put(url_com.geturl())

    def start(self, url):
        self.queue.put(url)
        # Start 20 worker threads, then block until the queue is drained.
        for i in range(20):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()
        self.queue.join()

    def _worker(self):
        while True:
            url = self.queue.get()
            if url in self.visited:
                continue
            else:
                # Hand the download off to a Celery worker and wait for the result.
                result = get_html.delay(url)
                try:
                    html = result.get(timeout=5)
                except Exception as e:
                    print(url)
                    print(e)
                    continue
                self.process_html(html)
                self._add_links_to_queue(url, html)
                self.visited[url] = True
                self.queue.task_done()


s = Spider()
s.start("http://www.jb51.net/")
Because of some special cases in the HTML, the program still needs improvement.
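For example, one possible refinement (a sketch, not part of the original code) is to replace the _worker method of the Spider class above with a version that skips pages that failed to download (get_html returns None on an HTTPError) and always calls task_done(), so that queue.join() can finish even when a URL is skipped.

    def _worker(self):
        while True:
            url = self.queue.get()
            try:
                if url in self.visited:
                    continue
                result = get_html.delay(url)
                try:
                    html = result.get(timeout=5)
                except Exception as e:
                    print(url)
                    print(e)
                    continue
                # Skip pages that could not be downloaded instead of parsing None.
                if html:
                    self.process_html(html)
                    self._add_links_to_queue(url, html)
                self.visited[url] = True
            finally:
                # Always mark the item as processed so queue.join() can return.
                self.queue.task_done()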