#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-20 09:46:20
# Project: fly_spider
"""pyspider handler that walks zhanqi.tv game pages down to live-room pages.

Crawl flow: ``on_start`` -> ``index_page`` (game index) -> ``list_page``
(rooms of one game) -> ``detail_page`` (one live room, returns a result dict).
"""

import re
import time

# from pyspider.database.mysql.mysqldb import SQL
# BaseHandler, every and config are not imported anywhere else in this file,
# so they are expected to come from this star import (pyspider script style).
from pyspider.libs.base_handler import *
from pyquery import PyQuery as PQ

# URL filters, compiled once. Raw strings keep ``\w`` a regex escape and the
# dots are escaped so they only match a literal ".".
GAME_URL_RE = re.compile(r"http://www\.zhanqi\.tv/games/\w+", re.U)
ROOM_URL_RE = re.compile(r"http://www\.zhanqi\.tv/\w+", re.U)

# Script handed to the JS fetcher for every rendered page.
# NOTE(review): window.scrollTo(...) is evaluated immediately and setTimeout
# only receives its return value, so the scroll is not actually delayed by
# 5 seconds. Kept as in the original script because changing the timing could
# change what the renderer captures -- confirm before "fixing".
SCROLL_TO_BOTTOM_JS = """
function() {
    setTimeout(window.scrollTo(0, document.body.scrollHeight), 5000);
}
"""


class Handler(BaseHandler):
    """Crawler for zhanqi.tv: game index -> room list -> room detail."""

    # Request headers sent with every fetch (referenced from crawl_config).
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"
        ),
    }

    # Keys must be lowercase; the mangled "Headers"/"Timeout" spellings are
    # not option names pyspider documents.
    crawl_config = {
        "headers": headers,
        "timeout": 100,
    }

    def _follow_links(self, response, selector, url_pattern, callback):
        """Schedule a JS-rendered crawl for each matching link on the page.

        :param response: response object whose ``doc`` is queried.
        :param selector: CSS selector picking the candidate ``<a>`` elements.
        :param url_pattern: compiled regex the link's href must match.
        :param callback: handler method that receives the fetched page.
        """
        for link in response.doc(selector).items():
            href = link.attr.href
            # An anchor without an href yields None; skip it instead of
            # letting re.match raise TypeError.
            if href and url_pattern.match(href):
                self.crawl(
                    href,
                    fetch_type='js',
                    js_script=SCROLL_TO_BOTTOM_JS,
                    callback=callback,
                )

    @every(minutes=1)
    def on_start(self):
        """Seed the crawl with the game index page."""
        self.crawl('http://www.zhanqi.tv/games', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every per-game link found on the game index page."""
        print(response)
        self._follow_links(
            response,
            'a[href^="http://www.zhanqi.tv/games/"]',
            GAME_URL_RE,
            self.list_page,
        )

    @config(age=1 * 60 * 60, priority=2)
    def list_page(self, response):
        """Follow every room link in the active tab's hot list."""
        self._follow_links(
            response,
            '.active > div.live-list-tabc > ul#hotlist.clearfix > li > a',
            ROOM_URL_RE,
            self.detail_page,
        )

    @config(age=1 * 60 * 60, priority=2)
    def detail_page(self, response):
        """Extract the fields of one live-room page.

        :returns: dict with the page URL, author, title, game name, viewer
            count text, the HTML of the last ``.video-flash-cont`` element
            (``None`` when the page has none) and the picture text.
        """
        # Start from None so a page without a player block yields a result
        # instead of raising NameError on an unbound variable.
        flash_html = None
        for player in response.doc('.video-flash-cont').items():
            # items() already yields PyQuery objects, so no re-wrapping in
            # PQ() is needed to call html().
            flash_html = player.html()
            print(flash_html)

        return {
            "url": response.url,
            "author": response.doc('.meat > span').text(),
            "title": response.doc('.title-name').text(),
            "game-name": response.doc('span > .game-name').text(),
            "users2": response.doc(
                'div.live-anchor-info.clearfix > div.sub-anchor-info > '
                'div.clearfix > div.meat-info > span.num.dv.js-onlines-panel > '
                'span.dv.js-onlines-txt > span'
            ).text(),
            "flash-cont": flash_html,
            # NOTE(review): text() of an <img> is presumably empty; the image
            # URL may have been intended (attr.src) -- confirm.
            "picture": response.doc('.active > img').text(),
        }
# pyspider - Crawling Video Links