# encoding: utf-8
import os
import re
import subprocess
import sys

import chardet
import scrapy
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.utils.url import urljoin_rfc
from mychardet import *  # author's local helper module (not shown); expected to provide mytogb18030

# print sys.getdefaultencoding()
# print sys.path
def get_default_to_codec():
    return mytogb18030
def get_first(a):
    # Return the first element of a list, or u'' when the list is None or empty.
    # print 'enter get_first', repr(a)
    if a is None:
        return u''
    elif len(a) == 0:
        return u''
    b = a[0]
    # print repr(b)  #, chardet.detect(b)
    return b
class Greasemonkey1Spider(scrapy.Spider):
    name = "test"
    allowed_domains = ["localhost"]
    start_urls = (
        'http://localhost/test',
    )

    def parse_context(self, response):
        # Pull out the page title and body; the original only used these
        # in the commented-out debug print at the end.
        # print "Enter parse_context:", response.url
        hxs = response
        sel = hxs.xpath('//title/text()')
        if sel is not None:
            titles = sel.extract()
            if len(titles) > 0:
                title = titles[0]
            else:
                title = ''

        sel = hxs.xpath('/html/body')
        if sel is not None:
            bodys = sel.extract()
            if len(bodys) > 0:
                body = bodys[0]
            else:
                body = ''
        # print title, repr(body)
    def parse(self, response):
        baseurl = response.url
        print 'baseurl =', baseurl

        self.parse_context(response)

        # Recursive step: for every <a> on the page, resolve its href
        # against the current URL and queue a request back into parse().
        hxs = response.xpath(r'//a')
        for path in hxs:
            titles = get_first(path.xpath(r'text()').extract())
            urls = get_first(path.xpath(r'@href').extract())

            # print titles, urls
            item_url = urljoin_rfc(baseurl, urls)
            yield Request(item_url, callback=self.parse)
if __name__ == '__main__':
    # Launch the spider through scrapy.exe as a subprocess so the script
    # can be run directly; --nolog suppresses Scrapy's own logging.
    cmd = r'E:\Python27\Scripts\scrapy.exe crawl --nolog test'
    cwd = os.path.split(__file__)[0]
    p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, shell=False, cwd=cwd)
    while p.poll() is None:
        # communicate() blocks until the child exits, so this loop
        # effectively runs once and then drains stdout/stderr.
        out, err = p.communicate()
        # print 'out, err', out, err, repr(out), repr(err)
        if err:
            print err
        elif out:
            print out

    print p.returncode

    # while not p.poll():
    #     print p.stdout.read()
    #     print p.stderr.read()
Scrapy recursively downloading a website: parse() extracts every <a> on the page, resolves its href against the page URL, and yields a new Request back into parse(), so the crawl walks every reachable page under allowed_domains.
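The imports above (BaseSpider, HtmlXPathSelector, urljoin_rfc) come from old Scrapy on Python 2 and are gone from current releases. Below is a minimal sketch of the same recursive crawl for Scrapy 1.8+ on Python 3, assuming the same localhost start URL; the RecursiveSpider name is illustrative, not from the original. response.follow() resolves relative hrefs itself, replacing the urljoin_rfc() + Request() pair, and CrawlerProcess runs the spider in-process instead of shelling out to scrapy.exe.

import scrapy
from scrapy.crawler import CrawlerProcess

class RecursiveSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['localhost']
    start_urls = ['http://localhost/test']  # same start URL as the original

    def parse(self, response):
        # Grab the title the way parse_context() does, then follow every link.
        title = response.xpath('//title/text()').get(default='')
        self.logger.info('%s -> %s', response.url, title)
        for href in response.xpath('//a/@href').getall():
            # follow() resolves href against response.url; Scrapy's
            # duplicate-request filter drops URLs already queued.
            yield response.follow(href, callback=self.parse)

if __name__ == '__main__':
    process = CrawlerProcess(settings={'LOG_LEVEL': 'ERROR'})
    process.crawl(RecursiveSpider)
    process.start()

Scrapy's duplicate-request filter is what keeps this recursion from looping forever on pages that link back to each other; the original script depends on the same behavior.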