Implementing a Web Crawler in Python

1. Introduction

        The crawler program consists of two classes: Crawler, which manages the entire crawling process, and Retriever, which downloads and parses each individual web page.

2. Program
#!/usr/bin/env python
# NOTE: this listing requires Python 2; htmllib, urllib.urlretrieve, urlparse,
# formatter, cStringIO and the string functions used below were removed or
# reorganized in Python 3.

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):                # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)            # local directory
        if sep != '/':                  # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                 # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):         # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):                  # manage entire crawling process
    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]                  # queue of URLs to visit
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # domain of the starting URL

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':            # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
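The listing above only runs on Python 2. For readers on a current interpreter, below is a minimal sketch of the same two-class idea using only the Python 3 standard library. The class names (LinkParser, Crawler), the choice not to save pages to disk, and the use of html.parser and urllib.request are assumptions made for this sketch, not part of the original program.

# Minimal Python 3 sketch of the same design (assumed rewrite, not the original code).
import sys
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen


class LinkParser(HTMLParser):
    """Collects the href of every <a> tag (plays the Retriever's parsing role)."""

    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchors.append(value)


class Crawler:
    """Manages the URL queue and restricts crawling to the starting domain."""

    def __init__(self, url):
        self.queue = [url]
        self.seen = set()
        self.domain = urlparse(url).netloc

    def get_page(self, url):
        try:
            with urlopen(url) as resp:
                html = resp.read().decode('utf-8', errors='replace')
        except OSError as exc:          # covers URLError and socket errors
            print('*** ERROR:', url, exc)
            return
        self.seen.add(url)
        parser = LinkParser()
        parser.feed(html)
        for link in parser.anchors:
            link = urljoin(url, link)               # resolve relative links
            if link.startswith('mailto:'):
                continue                            # skip mail links
            if urlparse(link).netloc != self.domain:
                continue                            # stay inside the start domain
            if link not in self.seen and link not in self.queue:
                self.queue.append(link)
                print('queued:', link)

    def go(self):
        while self.queue:
            self.get_page(self.queue.pop())


if __name__ == '__main__':
    start = sys.argv[1] if len(sys.argv) > 1 else input('Enter starting URL: ')
    if start:
        Crawler(start).go()

Like the original, the sketch takes the starting URL from the command line or an interactive prompt, and it only follows links that stay on the starting domain.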