"""Breadth-limited web crawler exercise: collect URLs from anchor tags.

Originally written for Python 2 (urllib2 / cookielib / HTMLParser) with the
``reload(sys); sys.setdefaultencoding('utf8')`` hack; modernized here to the
Python 3 standard library. Run as a script to crawl from the default start
page and print every URL discovered.
"""
import sys
from html.parser import HTMLParser
from http.cookiejar import CookieJar
from urllib.error import URLError
from urllib.request import HTTPCookieProcessor, build_opener


class WebParser(HTMLParser):
    """Collect hyperlinks from ``<a href=...>`` tags into a shared set.

    Absolute links (starting with ``http``) are added as-is; site-relative
    links (starting with ``/``) are prefixed with *path*.
    """

    def __init__(self, links, path):
        HTMLParser.__init__(self)
        self.links = links  # caller-owned set; mutated in place as tags arrive
        self.path = path    # base URL used to absolutize '/'-relative hrefs

    def handle_starttag(self, tag, attrs):
        # Only anchor tags carry the links we want.
        if tag != 'a':
            return
        for key, val in attrs:
            if key == 'href' and val:
                if val.startswith('http'):
                    self.links.add(val)
                elif val.startswith('/'):
                    self.links.add(self.path + val)


class Crawl:
    """Cookie-aware crawler that expands a frontier of URLs page by page."""

    def __init__(self, start_path='http://www.baidu.com', max_pages=200):
        # start_path and max_pages were hard-coded in the original; the
        # defaults preserve its behavior while allowing reuse.
        self.path = start_path
        self.max_pages = max_pages
        self.cookie = CookieJar()
        handler = HTTPCookieProcessor(self.cookie)
        self.opener = build_opener(handler)
        self.response = None  # most recent HTTP response, set by open()

    def open(self, path):
        """Fetch *path* through the cookie-aware opener and keep the response."""
        self.response = self.opener.open(path)

    def showCookie(self):
        """Print name/value of every cookie collected so far."""
        for item in self.cookie:
            print('name =' + item.name)
            print('value =' + item.value)

    def showResponse(self):
        """Print the body of the most recent response."""
        print(self.response.read())

    def getAllUrl(self, links, path):
        """Fetch *path* and add every URL found on that page to *links*.

        Failures are printed but never raised: a single dead link must not
        abort the whole crawl (best-effort, as in the original).
        """
        try:
            self.open(path)
            # Decode bytes -> str; replace undecodable bytes rather than die
            # on pages that are not clean UTF-8.
            res = self.response.read().decode('utf-8', errors='replace')
            parser = WebParser(links, path)
            parser.feed(res)
            parser.close()
        except (URLError, OSError, ValueError) as e:
            print(e)

    def crawl(self):
        """Crawl up to ``max_pages`` pages starting from ``self.path``.

        Returns:
            The union of visited URLs and URLs still pending in the frontier.
        """
        src_links = set()      # frontier: discovered but not yet fetched
        result_links = set()   # already fetched
        self.getAllUrl(src_links, self.path)
        n = self.max_pages
        while src_links and n > 0:
            link = src_links.pop()
            if link in result_links:
                # BUG FIX: the original used 'pass' here and fell through,
                # re-fetching pages it had already visited and wasting the
                # page budget on duplicates. Skip them instead.
                continue
            result_links.add(link)
            self.getAllUrl(src_links, link)
            n -= 1
            print(n)
        return result_links | src_links


if __name__ == '__main__':
    # Guarded so that importing this module does not start a network crawl
    # (the original ran unconditionally at module top level).
    c = Crawl()
    rlt = c.crawl()
    for link in rlt:
        print(link)
A Python practice exercise: a small crawler that extracts URLs from web pages.