Scrapy uses PhantomJS to crawl data
Environment: Python 2.7 + Scrapy + Selenium + PhantomJS
Goal: test Scrapy working together with PhantomJS
Crawl target: a page that loads more content through JavaScript (a "load more" button)
Principle: enable the middleware in the settings file and modify the middleware's process_request function (put the PhantomJS logic there).
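A minimal sketch of the idea (the class name JsRenderMiddleware is made up for illustration; the real code follows in Step 2): when process_request returns a Response, Scrapy skips its own downloader for that request and hands the rendered page straight to the spider.

from scrapy.http import HtmlResponse
from selenium import webdriver

class JsRenderMiddleware(object):  # hypothetical name, for illustration only
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()  # assumes phantomjs is on PATH
        driver.get(request.url)  # PhantomJS executes the page's JavaScript
        body = driver.page_source.encode('utf-8')
        driver.quit()
        # returning a Response here short-circuits the normal download
        return HtmlResponse(request.url, encoding='utf-8', body=body, request=request)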
Step 1:
settings.py

DOWNLOADER_MIDDLEWARES = {
    'dbdm.middlewares.DbdmSpiderMiddleware': 543,
}
The project name (dbdm here) will differ for your project; only the dotted path changes.
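For example, in a hypothetical project named mybook the same setting would read:

DOWNLOADER_MIDDLEWARES = {
    'mybook.middlewares.MybookSpiderMiddleware': 543,
}

(543 is the ordering value; middlewares with smaller numbers sit closer to the engine.)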
Step 2:
---------- PhantomJS is enabled by default.
middlewares.py

Selenium (plus the other names the code below uses) must be imported at the top of the file:

import time
from scrapy.http import HtmlResponse
from selenium import webdriver

# ...... some code omitted
@classmethod
def process_request(cls, request, spider):
    # if request.meta.has_key('phantomjs'):  # meta check commented out, so PhantomJS runs for every request
    driver = webdriver.PhantomJS('E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe')
    driver.get(request.url)
    if request.url == 'https://movie.douban.com/tag':
        # click the tag, wait for the JS to load, then keep clicking "load more"
        driver.find_element_by_xpath('//*[@id="app"]/div[1]/div[1]/ul[1]/li[5]/span').click()
        time.sleep(5)
        if driver.find_element_by_xpath('//*[@id="app"]/div[1]/a'):
            click_more(driver)
    content = driver.page_source.encode('utf-8')
    # print content
    # file = open(path.join(d, '1.txt'), 'w')
    # file.write(content)
    # file.close()
    driver.quit()
    return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)
def click_more(driver, i=1):
    # click the "load more" link, wait, and recurse while the link still exists
    driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a').click()
    print str(i) + ' click'
    time.sleep(5)
    i = i + 1
    try:
        more_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a')
        if more_btn:
            click_more(driver, i)
    except:
        print 'click Over!!'
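A side note on click_more: it recurses once per click, so a page needing many hundreds of clicks could hit Python's recursion limit. An iterative variant (a sketch only, using the same XPath as above) avoids that:

def click_more(driver):
    # loop instead of recursing; stop when the "load more" link disappears
    i = 1
    while True:
        try:
            more_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a')
        except:
            print 'click Over!!'
            break
        more_btn.click()
        print str(i) + ' click'
        time.sleep(5)
        i += 1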
The above is test code only; adapt it to your own project. As written, PhantomJS opens every URL by default; the next section shows how to enable it only when needed.
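Also note that this version starts a fresh PhantomJS process for every request, which is slow. One possible refinement (not from the original code; the class name is made up) is to keep a single long-lived driver:

class PhantomJSMiddleware(object):  # hypothetical name
    def __init__(self):
        # one browser for the whole crawl instead of one per request
        self.driver = webdriver.PhantomJS('E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe')

    def process_request(self, request, spider):
        self.driver.get(request.url)
        content = self.driver.page_source.encode('utf-8')
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

Remember to quit() the driver when the crawl finishes, for example from a spider_closed signal handler.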
----------- Enable it when necessary
Decide by checking a key in request.meta.

As before, Selenium must be imported at the top of the file:

from selenium import webdriver

# ...... some code omitted
@classmethod
def process_request(cls, request, spider):
    if request.meta.has_key('PhantomJS'):
        driver = webdriver.PhantomJS('E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe')
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
        driver.quit()
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)
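dict.has_key() exists only in Python 2; if the project ever moves to Python 3, an equivalent check is:

if request.meta.get('PhantomJS'):
    # ... same PhantomJS logic as above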
The meta key is set in the spider file when the request is created:
import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from phantomjs_test.items import PhantomscrapyItem

class PhantomjsTestSpider(CrawlSpider):
    name = 'phantomjs_test'
    allowed_domains = ['book.com']
    start_urls = ['http://book.com/']
    # all_urls = []  # deduplication does not seem to be needed

    rules = (
        ### collect all the list pages
        Rule(LinkExtractor(allow=r'/story/p/[2-9]*'), callback='parse', follow=True),
        ### collect all the detail pages
        # Rule(LinkExtractor(allow=r'/detail/p/[2-9]*'), callback='parse_item', follow=True),
    )

    ### extract every article URL from the page
    def parse(self, response):
        url_list = response.xpath('/a/@href').extract()
        for url in url_list:
            request = Request(url=url, callback=self.parse_item, dont_filter=True)
            # the key must match the one the middleware checks ('PhantomJS')
            request.meta['PhantomJS'] = True
            yield request

    def parse_item(self, response):
        item = PhantomscrapyItem()
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        item['bookname'] = response.xpath()  # the XPath expression is omitted in the original
        items = []
        items.append(item)
        return items
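With the flag set this way, only the requests yielded from parse are rendered by PhantomJS; the list pages matched by the rules go through Scrapy's normal downloader. The spider runs as usual:

scrapy crawl phantomjs_test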
That is the difference between rendering with PhantomJS by default and enabling it only when a condition is met. Choose whichever suits each kind of page; the code above still needs polishing to be truly usable.