from HTMLParser import HTMLParser
import urllib2
import re
from time import sleep


class MyHTMLParser(HTMLParser):
    """Collects the pagination links ('/vul/list/page/N') from the vulnerability list page."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        # print "Encountered the beginning of a %s tag" % tag
        if tag == "a":
            for (variable, value) in attrs:
                # str.find() returns -1 when not found, so "+ 1" is truthy only on a match
                if variable == 'href' and value.find('/vul/list/page/') + 1:
                    self.links.append(value)


class PageContentParser(HTMLParser):
    """Collects the vulnerability detail links and their titles from one list page."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.flag = 0
        self.urllist = []   # vulnerability detail URLs
        self.title = []     # vulnerability titles

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for (variable, value) in attrs:
                if value.find('/vul/info/qid/qtva') + 1:
                    self.flag = 1
                    self.urllist.append(value)
                    # write the URL to the local file
                    sleep(0.5)
                    f = open("./butian.html", 'a+')
                    f.write("http://loudong.360.cn" + value + "---------")
                    f.close()

    def handle_data(self, data):
        # flag == 1 means we are inside an <a> tag that points to a detail page,
        # so the text data here is the vulnerability title
        if self.flag == 1:
            y = data.decode("utf-8")
            k = y.encode("GB18030")
            self.title.append(k)
            self.flag = 0
            # write the title to the local file
            sleep(0.5)
            f = open('./butian.html', 'a+')
            f.write(k + "<br>")
            f.close()


if __name__ == "__main__":
    print "start....."
    content = urllib2.urlopen("http://loudong.360.cn/vul/list/").read()
    hp = MyHTMLParser()
    hp.feed(content)
    # the last pagination link carries the largest page number, i.e. the total page count
    num = int(filter(str.isdigit, hp.links[-1]))
    hp.close()
    print num
    for i in range(num):
        page = i + 1
        pagecontent = urllib2.urlopen("http://loudong.360.cn/vul/list/page/" + str(page)).read()
        parser = PageContentParser()
        parser.feed(pagecontent)
        parser.close()
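A quick note on how the page count is obtained: in Python 2, filter() applied to a str returns a str, so filter(str.isdigit, ...) keeps only the digit characters of a pagination link, and int() turns them into the loop bound used in __main__. A minimal sketch, assuming the last link collected by MyHTMLParser looks like '/vul/list/page/118' (the real number is whatever the site currently reports):

last_link = '/vul/list/page/118'          # hypothetical pagination link for illustration
num = int(filter(str.isdigit, last_link))  # filter() on a str yields '118' in Python 2
print num                                  # -> 118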
The effect (the links and titles written to butian.html) is as follows:
The first version of the crawler: crawling the Butian vulnerability links and titles