#My Code如下,問題在代碼的注釋裡面
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle #用於定義需要提取的連結
class NewsSpider(CrawlSpider):
    """Spider "demo2": follows paginated activity-index links on 360.cn.

    Every link whose URL matches the ``allow`` pattern in ``rules`` is
    followed, and its response is passed to ``parse_item1``.
    """

    name = "demo2"
    allowed_domains = ["360.cn"]

    # This start URL was reported as tested successfully.
    # NOTE(review): the flattened source shows a "#" in front of start_urls;
    # it is restored as active here because a CrawlSpider with no start URLs
    # issues no requests -- confirm against the author's real file.
    start_urls = ["http://bobao.360.cn/activity/index&page=2"]

    # The rule below did NOT work as originally written:
    #     allow=r'/vul/index?type=all&page=\d{1,3}'
    # `allow` is a regular expression, and an unescaped "?" is a quantifier
    # that makes the preceding "x" optional, so the pattern never matches the
    # literal "?" of the query string.  Escape it as "\?":
    # rules = [Rule(sle(allow=r'/vul/index\?type=all&page=\d{1,3}'),
    #               follow=True, callback='parse_item1')]

    # This rule was reported as tested successfully (the pattern contains no
    # regex metacharacter that needs escaping).
    rules = [
        Rule(
            sle(allow=r'/activity/index&page=\d{1,3}'),
            follow=True,
            callback='parse_item1'
        ),
    ]
    # Link fragment matched by the rule above: /activity/index&page=

    def parse_item1(self, response):
        """Rule callback: print a marker line for each matched page."""
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(u'這是誰?????????????????????')
回複內容:
#My Code如下,問題在代碼的注釋裡面
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle #用於定義需要提取的連結
class NewsSpider(CrawlSpider):
    """Spider "demo2": follows paginated activity-index links on 360.cn.

    Every link whose URL matches the ``allow`` pattern in ``rules`` is
    followed, and its response is passed to ``parse_item1``.
    """

    name = "demo2"
    allowed_domains = ["360.cn"]

    # This start URL was reported as tested successfully.
    # NOTE(review): the flattened source shows a "#" in front of start_urls;
    # it is restored as active here because a CrawlSpider with no start URLs
    # issues no requests -- confirm against the author's real file.
    start_urls = ["http://bobao.360.cn/activity/index&page=2"]

    # The rule below did NOT work as originally written:
    #     allow=r'/vul/index?type=all&page=\d{1,3}'
    # `allow` is a regular expression, and an unescaped "?" is a quantifier
    # that makes the preceding "x" optional, so the pattern never matches the
    # literal "?" of the query string.  Escape it as "\?":
    # rules = [Rule(sle(allow=r'/vul/index\?type=all&page=\d{1,3}'),
    #               follow=True, callback='parse_item1')]

    # This rule was reported as tested successfully (the pattern contains no
    # regex metacharacter that needs escaping).
    rules = [
        Rule(
            sle(allow=r'/activity/index&page=\d{1,3}'),
            follow=True,
            callback='parse_item1'
        ),
    ]
    # Link fragment matched by the rule above: /activity/index&page=

    def parse_item1(self, response):
        """Rule callback: print a marker line for each matched page."""
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(u'這是誰?????????????????????')
此處要注意「?」號的轉義:allow 參數是正則運算式,「?」在其中是量詞(表示前一個字元可有可無),從網頁複製連結過來後需要把「?」寫成「\?」才能比對到字面上的問號。
網頁中的連結是這樣:/rwxwsblog/default.html?page=3
要寫成這樣:Rule(sle(allow=(r"/rwxwsblog/default.html\?page=\d{1,}")), ...)  # 此處要注意?號的轉義,複製過來需要對?號進行轉義(建議使用 r"" 原始字串)。