Beijing Alice Gynecology Hospital (http://fuke.fuke120.com/)
First, let's talk about configuring Splash.
1. Install the scrapy-splash library with pip
pip install scrapy-splash
2. Next we need another handy tool: Docker
Docker: https://www.docker.com/community-edition#/windows
3. After installing Docker, start it and pull the Splash image
docker pull scrapinghub/splash
4. Run Splash with Docker
docker run -p 8050:8050 scrapinghub/splash  (once it is running, open http://192.168.99.100:8050 in your browser to check that Splash is working)
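Besides opening the URL in a browser, you can also check the Splash HTTP API from Python. The following is a minimal sketch (not part of the original project) that asks Splash's standard /render.html endpoint to render the target site; it assumes Splash is listening on 192.168.99.100:8050 as above.

import requests

# Ask Splash to render the page and return the final (JavaScript-executed) HTML.
resp = requests.get(
    'http://192.168.99.100:8050/render.html',
    params={'url': 'http://fuke.fuke120.com/', 'wait': 0.5},
)
print(resp.status_code)                 # 200 means Splash is up and rendering
print(len(resp.text), 'bytes of rendered HTML')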
5. settings.py configuration
SPLASH_URL = 'http://192.168.99.100:8050'  # Most important of all, and a big pitfall: be sure to use 192.168.99.100 (the Docker machine's address); I kept trying my own machine's IP and it never worked
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
ROBOTSTXT_OBEY = True  (note: some sites work with True, but for others you need to change it to False)
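With these settings in place, a spider routes a request through Splash simply by yielding a SplashRequest instead of a plain Request. Here is a minimal, self-contained sketch of that idea (the spider name and start URL are placeholders; 3.py below does the same thing as part of the real pipeline):

import scrapy
from scrapy_splash import SplashRequest


class DemoSplashSpider(scrapy.Spider):
    name = "splash_demo"  # hypothetical spider name, for illustration only
    start_urls = ["http://fuke.fuke120.com/"]

    def start_requests(self):
        for url in self.start_urls:
            # 'wait' gives the page time to run its JavaScript before Splash returns the HTML
            yield SplashRequest(url, self.parse, args={'wait': 0.5})

    def parse(self, response):
        # response.body now contains the rendered HTML
        self.logger.info("rendered %d bytes from %s", len(response.body), response.url)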
The crawler's .py files
1.py
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from scrapy.http import Request
# from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy.selector import HtmlXPathSelector

client = pymongo.MongoClient(host="127.0.0.1")
db = client.health
collection = db.healthclass  # table name: classification

import redis  # import the Redis database
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

ii = 0


class healthcareClassSpider(scrapy.Spider):
    name = "HealthCare"
    allowed_domains = ["fuke120.com"]  # domains the spider is allowed to visit
    start_urls = ["http://fuke.fuke120.com/"]

    # parse is called back for every crawled page
    def parse(self, response):
        global ii
        hxs = HtmlXPathSelector(response)
        hx = hxs.select('//div[@id="Allsort"]/div[@class="item"]/span/a')
        hx1 = hxs.select('//div[@id="Allsort"]/div[@class="Item Born"]/span/a')
        # hx2 = hxs.select('//div[@id="Allsort"]/div[@class="item"]/div[@class="I-MC"]/div[@class="I-MC01"]/ul[@class="W_ul01"]/li/a')
        for secItem in hx:
            ii += 1
            url = secItem.select("@href").extract()
            c = "http://fuke.fuke120.com" + url[0]
            name = secItem.select("text()").extract()
            print(c)
            print(name)
            classid = collection.insert({'healthclass': name, 'pid': None})
            healthurl = '%s,%s,%s' % (classid, c, ii)
            r.lpush('healthclassurl', healthurl)
        for secItem1 in hx1:
            url = secItem1.select("@href").extract()
            c1 = "http://fuke.fuke120.com" + url[0]
            name1 = secItem1.select("text()").extract()
            print(c1)
            print(name1)
            classid = collection.insert({'healthclass': name1, 'pid': None})
            healthurl = '%s,%s,%s' % (classid, c1, 0)
            r.lpush('healthclassurl', healthurl)
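After running 1.py (scrapy crawl HealthCare), every top-level category sits in the Redis list healthclassurl as a 'mongo-id,url,index' string, which is what 2.py consumes. A quick sketch (not part of the original code) for inspecting that queue, assuming the same local Redis as above:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
for item in r.lrange('healthclassurl', 0, -1):
    classid, url, num = item.decode().split(',')  # same 'id,url,index' format that 1.py pushed
    print(classid, url, num)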
2.py
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy.selector import HtmlXPathSelector
from bson.objectid import ObjectId
# from scrapy.http import Request
# from urllib.request import urlopen
from scrapy.http import Request
# from hello.items import zhaopinitem
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from urllib.request import Request, ProxyHandler
from urllib.request import build_opener

client = pymongo.MongoClient(host="127.0.0.1")
db = client.health  # database name
collection = db.diseaseclass  # table name: classification

import redis  # import the Redis database
r = redis.Redis(host='192.168.60.112', port=6379, db=0, charset='utf-8')


class healthcareClassSpider(scrapy.Spider):
    name = "HealthCare1"
    allowed_domains = ["fuke120.com"]  # domains the spider is allowed to visit
    dict = {}
    start_urls = []

    def __init__(self):
        a = r.lrange('healthclassurl', 0, -1)
        for item in a:
            healthurl = bytes.decode(item)
            arr = healthurl.split(',')
            healthcareClassSpider.start_urls.append(arr[1])
            num = arr[2]
            pid = arr[0]
            url = arr[1]
            self.dict[url] = {"pid": pid, "num": num}

    def parse(self, response):
        nameInfo = self.dict[response.url]
        pid1 = nameInfo['pid']
        pid = ObjectId(pid1)
        num = nameInfo['num']
        hxs = HtmlXPathSelector(response)
        hx = hxs.select('//div[@class="X_con02_2"]/div[@class="X_con02_3"]/ul/li/p/a')
        for secItem in hx:
            url = secItem.select("@href").extract()
            url = "http://fuke.fuke120.com" + url[0]
            name = secItem.select("text()").extract()
            print(url)
            print(name)
            classid = collection.insert({'diseaseclass': name, 'pid': pid})
            diseaseclassurl = '%s,%s,%s' % (classid, url, pid)
            r.lpush('diseaseclassurl', diseaseclassurl)
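2.py ties each disease category to its parent by carrying the parent's MongoDB _id (pid) through the Redis queue. A small sketch (again, not part of the original project) to spot-check that linkage after HealthCare1 has run; it assumes the same Mongo and Redis hosts used in the code above:

import pymongo
import redis
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
db = client.health
r = redis.Redis(host='192.168.60.112', port=6379, db=0)

# Take one queued disease-class entry and look up both the child and its parent category.
entry = r.lindex('diseaseclassurl', 0)
if entry:
    classid, url, pid = entry.decode().split(',')
    child = db.diseaseclass.find_one({'_id': ObjectId(classid)})
    parent = db.healthclass.find_one({'_id': ObjectId(pid)})
    print('child :', child and child.get('diseaseclass'))
    print('parent:', parent and parent.get('healthclass'))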
3.py
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy_splash import SplashMiddleware
from scrapy.http import Request, HtmlResponse
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from bson.objectid import ObjectId
# from DiseaseHealth.diseaseHealth.spiders.SpiderJsDynamic import phantomjs1
# from scrapy.http import Request
# from urllib.request import urlopen
from scrapy.http import Request

client = pymongo.MongoClient(host="127.0.0.1")
db = client.health  # database name
collection = db.treatclass  # table name: classification

import redis  # import the Redis database (was commented out in the original, but r is used below)
r = redis.Redis(host='192.168.60.112', port=6379, db=0, charset='utf-8')


class healthcareClassSpider(scrapy.Spider):
    name = "HealthCare2"
    allowed_domains = ["fuke120.com"]  # domains the spider is allowed to visit
    dict = {}
    start_urls = []

    def __init__(self):
        a = r.lrange('diseaseclassurl', 0, -1)
        for item in a:
            healthurl = bytes.decode(item)
            arr = healthurl.split(',')
            healthcareClassSpider.start_urls.append(arr[1])
            num = arr[2]
            pid = arr[0]
            url = arr[1]
            self.dict[url] = {"pid": pid, "num": num}

    # route every start URL through Splash so the page's JavaScript is rendered
    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 0.5})

    def parse(self, response):
        # a = response.body.decode('utf-8')
        # print(a)
        nameInfo = self.dict[response.url]
        pid1 = nameInfo['pid']
        pid = ObjectId(pid1)
        num = nameInfo['num']
        print(num)
        print(pid)
        hxs = HtmlXPathSelector(response)
        hx = hxs.select('//div[@class="Dh01"]/ul[@class="ul_bg01"]/li/a')
        for secItem in hx:
            url = secItem.select("@href").extract()
            c = "http://fuke.fuke120.com" + url[0]
            name = secItem.select("text()").extract()
            print(c)
            print(name)
            classid = collection.insert({'treatclass': name, 'pid': pid})
            treatclassurl = '%s,%s,%s' % (classid, c, pid)
            r.lpush('treatclassurl', treatclassurl)
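The three spiders are meant to run in order, each feeding the next through Redis: HealthCare fills healthclassurl, HealthCare1 reads it and fills diseaseclassurl, and HealthCare2 reads that and writes the treatment categories; only the last one goes through Splash, presumably because those pages need JavaScript to render. Once all three have run, a quick sketch like the following (assuming the collection and field names used above, and pymongo >= 3.7 for count_documents) shows the resulting hierarchy in Mongo:

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.health

# Document counts at each level of the category -> disease -> treatment hierarchy.
print('health classes :', db.healthclass.count_documents({}))
print('disease classes:', db.diseaseclass.count_documents({}))
print('treat classes  :', db.treatclass.count_documents({}))

# Show one treatment entry together with its parent disease class.
doc = db.treatclass.find_one()
if doc:
    parent = db.diseaseclass.find_one({'_id': doc['pid']})
    print(doc.get('treatclass'), '<-', parent and parent.get('diseaseclass'))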
The main point of this exercise is learning how to use scrapy-splash.