Create a new file, crawlertask.py, which performs the data-crawling task, as follows:
# coding: utf-8
from celery import Celery, platforms
import requests
from bs4 import BeautifulSoup

app = Celery('tasks', broker='redis://localhost:6379/0')
app.conf.CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
platforms.C_FORCE_ROOT = True  # allow the worker to run as root

def format_str(s):
    # Strip newlines, spaces, and tabs from the scraped text.
    return s.replace('\n', '').replace(' ', '').replace('\t', '')

@app.task
def get_urls_in_pages(from_page_num, to_page_num):
    urls = []
    search_word = 'computer'
    url_part_1 = 'http://www.phei.com.cn/module/goods/searchkey.jsp?Page='
    url_part_2 = '&Page=2&searchKey='
    for i in range(from_page_num, to_page_num + 1):
        urls.append(url_part_1 + str(i) + url_part_2 + search_word)
    all_href_list = []
    for url in urls:
        resp = requests.get(url)
        bs = BeautifulSoup(resp.text, 'html.parser')
        a_list = bs.find_all('a')
        needed_list = []
        for a in a_list:
            if 'href' in a.attrs:
                href_val = a['href']
                title = a.text
                # Keep only book-detail links, skipping shopping-cart
                # links and entries with empty titles.
                if 'bookid' in href_val and 'shopcar0.jsp' not in href_val and title != '':
                    entry = [format_str(title), format_str(href_val)]
                    if entry not in needed_list:
                        needed_list.append(entry)
        all_href_list += needed_list
    # Write the collected (title, href) pairs to a file named after the page range.
    all_href_file = open(str(from_page_num) + '_' + str(to_page_num) + '_all_hrefs.txt', 'w')
    for href in all_href_list:
        all_href_file.write('\t'.join(href) + '\n')
    all_href_file.close()
    return len(all_href_list)
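Before deploying, the task can be exercised in-process. The following is a minimal sketch, assuming requests, bs4, and celery are installed locally; it calls the task function directly, which runs it synchronously in the current process without a broker or worker. The choice of pages 1 to 2 is arbitrary.

# local_check.py (hypothetical file name): quick sanity test of the task.
from crawlertask import get_urls_in_pages

if __name__ == '__main__':
    # Crawl only the first two result pages and print how many links were saved.
    print(get_urls_in_pages(1, 2))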
Deploy the script above to the two cloud servers, start the Redis service on each, and then execute:
celery worker -A crawlertask -l info -c 10

Here -A names the module containing the Celery app, -l info sets the log level, and -c 10 starts ten concurrent worker processes on each server.
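If the worker starts correctly, its startup banner should list the registered task. As an optional check (assuming the standard Celery CLI is available on the worker machine), you can also query the running worker:

celery -A crawlertask inspect registered

which should report crawlertask.get_urls_in_pages among the registered tasks.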
Create a new file, task_dist.py, on the local machine; it distributes the tasks asynchronously, as follows:
from celery import Celery
from threading import Thread
import time

# Four page ranges are distributed across the two servers: each broker
# address appears twice, so each server receives two tasks.
redis_ips = {
    0: 'redis://101.200.163.195:6379/0',
    1: 'redis://112.124.28.41:6379/0',
    2: 'redis://112.124.28.41:6379/0',
    3: 'redis://101.200.163.195:6379/0',
}

def send_task_and_get_results(ind, from_page, to_page):
    app = Celery('crawlertask', broker=redis_ips[ind])
    app.conf.CELERY_RESULT_BACKEND = redis_ips[ind]
    result = app.send_task('crawlertask.get_urls_in_pages',
                           args=(from_page, to_page))
    print(redis_ips[ind], result.get())  # blocks until the task finishes

if __name__ == '__main__':
    t1 = time.time()
    page_ranges_lst = [(1, 10), (11, 20), (21, 30), (31, 40)]
    th_lst = []
    # One thread per page range, so the four tasks are dispatched
    # and awaited concurrently.
    for ind, page_range in enumerate(page_ranges_lst):
        th = Thread(target=send_task_and_get_results,
                    args=(ind, page_range[0], page_range[1]))
        th_lst.append(th)
    for th in th_lst:
        th.start()
    for th in th_lst:
        th.join()
    t2 = time.time()
    print('spent:', t2 - t1)
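With the two workers running, launch the distribution script locally:

python task_dist.py

Each thread sends one page range to its broker and blocks in result.get() until the worker returns the link count, so each server ends up crawling two ranges of ten pages. Note that result.get() waits indefinitely by default; passing a timeout, e.g. result.get(timeout=600), is a common safeguard, though the script above follows the original and does not use one.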