# -*- coding: utf-8 -*-
# python: 2.x
__author__ = 'Administrator'

import urllib2

# Example: HTTP basic authentication with urllib2, first via an
# installed handler, then via a manually built Request.
LOGIN = 'WeSC'
PASSWD = "You'llneverguess"
URL = 'http://localhost'
def h1(url):                        # handler version
    from urlparse import urlparse as up
    hdlr = urllib2.HTTPBasicAuthHandler()
    hdlr.add_password('Archives', up(url)[1], LOGIN, PASSWD)
    opener = urllib2.build_opener(hdlr)
    urllib2.install_opener(opener)
    return url
def req(url):                       # Request version
    from base64 import encodestring as s
    req1 = urllib2.Request(url)
    b64str = s('%s:%s' % (LOGIN, PASSWD))[:-1]      # strip trailing '\n'
    req1.add_header('Authorization', 'Basic %s' % b64str)
    return req1

for func in (h1, req):              # try both versions
    print '*** using %s:' % func.__name__.upper()
    url = func(URL)
    f = urllib2.urlopen(url)
    print f.readline()
    f.close()
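
# A minimal variant (an addition, not part of the original example): when
# the server's realm string is not known in advance,
# HTTPPasswordMgrWithDefaultRealm matches any realm, so 'Archives' need
# not be hard-coded as it is in h1() above.
mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
mgr.add_password(None, URL, LOGIN, PASSWD)          # None = match any realm
any_realm = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(mgr))
# urllib2.install_opener(any_realm)                 # uncomment to make it global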

# -*- coding: utf-8 -*-
# python: 2.x
__author__ = 'Administrator'

# A simple Python web crawler: starting from a seed URL, it downloads
# each page and follows the links it finds within the same domain.
import sys, os, string
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO
class Retriever(object):                # downloads pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)       # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = os.path.splitext(path)
        if ext[1] == '':                            # no file extension, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = os.path.dirname(path)                # local directory
        if os.sep != '/':                           # os-independent path separator
            ldir = string.replace(ldir, '/', os.sep)
        if not os.path.isdir(ldir):                 # create archive dir if necessary
            if os.path.exists(ldir):
                os.unlink(ldir)
            os.makedirs(ldir)
        return path
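
    # Worked example (hypothetical URL): filename('http://localhost/docs/')
    # parses to host 'localhost' plus path '/docs/'; since there is no file
    # extension, 'index.htm' is appended, the local directory 'localhost/docs'
    # is created if needed, and 'localhost/docs/index.htm' is returned.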
    def download(self):                             # download the page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval
    def parseAndGetLinks(self):                     # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
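
# A minimal sketch of using Retriever on its own (the URL is an
# assumption; any reachable page works):
#
#   r = Retriever('http://localhost/')
#   print r.file                    # local save path, e.g. 'localhost/index.htm'
#   local, headers = r.download()   # fetch the page to that path
#   print r.parseAndGetLinks()      # anchors harvested from the saved HTML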
class Crawler(object):                  # manages the entire crawling process
    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]                  # queue of links to download
        self.seen = []                  # links already processed
        self.dom = urlparse(url)[1]     # domain of the seed URL
    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':            # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and \
                    string.find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)   # make relative links absolute
            print '* %s' % eachLink,

            if string.find(string.lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if string.find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'
    def go(self):                       # process links in the queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)
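
    # Design note: self.q.pop() takes the most recently added link, so the
    # crawl proceeds depth-first. A breadth-first variant (hypothetical,
    # not in the original) would pop from the head of the queue instead:
    #
    #   def go(self):
    #       while self.q:
    #           url = self.q.pop(0)     # FIFO order -> breadth-first crawl
    #           self.getPage(url)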
def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
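
# Example session (hypothetical output; assumes a web server on localhost
# serving a small site):
#
#   $ python crawler.py http://localhost/
#
#   ( 1 )
#   URL: http://localhost/
#   FILE: localhost/index.htm
#   * http://localhost/about.htm ... new, added to Q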