# The example code is as follows:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from selenium import selenium
Class Myspider (Crawlspider):
name = ' Cnbeta '
Allowed_domains = [' cnbeta.com ']
Start_urls = [' http://www.jb51.net ']
Rules = (
# Extract links matching ' category.php ' (but not matching ' subsection.php ')
# and follow links from them (since no callback means by default).
Rule (sgmllinkextractor allow= ('/articles/.*\.htm ',)),
callback= ' Parse_page ', follow=true),
# Extract links matching ' item.php ' and parse them with the spider ' s method parse_item
)
def __init__ (self):
Crawlspider.__init__ (self)
Self.verificationerrors = []
Self.selenium = Selenium ("localhost", 4444, "*firefox", "http://www.jb51.net")
Self.selenium.start ()
def __del__ (self):
Self.selenium.stop ()
Print Self.verificationerrors
Crawlspider.__del__ (self)
def parse_page (self, Response):
Self.log (' Hi, this is a item page!%s '% response.url)
SEL = Selector (response)
From Webproxy.items import Webproxyitem
sel = Self.selenium
Sel.open (Response.url)
Sel.wait_for_page_to_load ("30000")
Import time
Time.sleep (2.5)