This article describes how to use Scrapy together with Selenium to scrape JavaScript-rendered pages in Python. Scrapy's downloader only fetches the raw HTML of a page, so content generated by JavaScript is invisible to its selectors; the spider below therefore re-opens each matched page in a Selenium-driven browser, which executes the scripts before the content is read. The example targets the old Python 2 stack (Scrapy's contrib API and the Selenium RC client, which expects a Selenium server listening on localhost:4444).
The code is as follows:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from selenium import selenium
import time

from webproxy.items import WebproxyItem  # item class from the project's items.py (not yet used in this excerpt)

class MySpider(CrawlSpider):
    name = 'cnbeta'
    allowed_domains = ['cnbeta.com']
    start_urls = ['http://www.jb51.net']

    rules = (
        # Extract links matching '/articles/*.htm' and hand them to parse_page;
        # follow=True keeps the crawler walking further links found on those pages.
        Rule(SgmlLinkExtractor(allow=(r'/articles/.*\.htm',)),
             callback='parse_page', follow=True),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        # Connect to a Selenium RC server running on localhost:4444 and drive Firefox.
        self.selenium = selenium("localhost", 4444, "*firefox", "http://www.jb51.net")
        self.selenium.start()

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors
        CrawlSpider.__del__(self)

    def parse_page(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        # Scrapy's own selector sees only the static HTML of the response.
        hxs = Selector(response)

        # Re-open the URL in the Selenium-controlled browser so its JavaScript
        # runs, wait for the page load (30 s timeout), then give the scripts
        # a little extra time to finish before reading the rendered content.
        sel = self.selenium
        sel.open(response.url)
        sel.wait_for_page_to_load("30000")
        time.sleep(2.5)
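Two caveats. First, the Selenium RC client used above requires the standalone Selenium server to be running before the spider starts (e.g. java -jar selenium-server.jar, which listens on port 4444). Second, both Selenium RC and SgmlLinkExtractor have since been deprecated. As a rough sketch of the same technique on current libraries, here is what the spider might look like with Scrapy's modern link extractor and the WebDriver API; the spider name, the waiting strategy, and the page_source read are illustrative assumptions, not part of the original article:

import time

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver

class MyWebDriverSpider(CrawlSpider):
    name = 'cnbeta_webdriver'
    allowed_domains = ['cnbeta.com']
    start_urls = ['http://www.jb51.net']

    rules = (
        Rule(LinkExtractor(allow=(r'/articles/.*\.htm',)),
             callback='parse_page', follow=True),
    )

    def __init__(self, *args, **kwargs):
        super(MyWebDriverSpider, self).__init__(*args, **kwargs)
        # WebDriver launches and owns the browser directly; no separate server is needed.
        self.driver = webdriver.Firefox()

    def closed(self, reason):
        # Scrapy calls a spider's closed() method when the crawl ends.
        self.driver.quit()

    def parse_page(self, response):
        self.logger.info('Item page: %s', response.url)
        # Load the page in the browser so its JavaScript executes.
        self.driver.get(response.url)
        time.sleep(2.5)  # crude wait; WebDriverWait is the more robust choice
        # page_source holds the DOM as it stands after the JavaScript ran.
        rendered_html = self.driver.page_source

The main structural difference is lifecycle management: instead of starting and stopping an RC session in __init__ and __del__, the WebDriver spider creates the browser in __init__ and closes it in closed(), which Scrapy invokes reliably when the crawl finishes.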