Breadth-first search (BFS) crawling algorithm:
# -*- coding: utf-8 -*-
"""Single-threaded breadth-first web crawler.

Reconstructed from a formatting-mangled source: the original collapsed the
whole module onto one line with corrupted capitalization, so identifiers and
string literals below were restored to the evident intended Python.
"""
import threading
import urllib.request

# Lock kept from the original module; nothing in this single-threaded
# crawler acquires it yet — presumably reserved for a threaded variant.
mylock = threading.RLock()


class Crawler:
    """Breadth-first crawler restricted to links containing a site path."""

    def __init__(self):
        # Fix: the original used *class-level* mutable attributes, so every
        # Crawler instance shared one frontier/history. Per-instance state:
        self.unVisitUrl = set()   # frontier: discovered but not yet fetched
        self.visitedUrl = []      # record of URLs already seen, in order

    def getHtml(self, url):
        """Fetch *url* and return its body collapsed to one line of text.

        Returns '' on any network/decode failure — best-effort, matching the
        original, which silently swallowed all exceptions.
        """
        html = ''
        req = urllib.request.Request(url, headers={
            'Connection': 'keep-alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-us,en;q=0.8,zh-hans-cn;q=0.5,zh-hans;q=0.3',
            'User-Agent': ('Mozilla/5.0 (Windows NT 6.3; WOW64; '
                           'Trident/7.0; rv:11.0) like Gecko'),
        })
        try:
            response = urllib.request.urlopen(req, timeout=10)
            # Strip newlines so each page becomes one searchable line.
            html = response.read().decode('utf-8').replace('\n', '')
        except Exception:
            # Deliberate best-effort: unreachable/undecodable pages yield ''.
            pass
        return html

    def getUrlFromHtml(self, html, sitePath):
        """Collect every absolute link containing *sitePath* from *html*."""
        if not html:
            return
        # Imported lazily so the module loads even without bs4 installed.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        for anchor in soup.find_all('a'):
            # Fix: .get() instead of anchor['href'] — <a> tags without an
            # href are common; the original wrapped this in try/KeyError.
            href = anchor.get('href')
            if href and sitePath in href and href.startswith('http'):
                self.addUnVisitUrl(href)
                # Marking it visited immediately is the original's dedup
                # trick: addUnVisitUrl will skip any later duplicate.
                self.addVisitedUrl(href)

    def analysis(self, url, sitePath):
        """Breadth-first crawl starting at *url*, following sitePath links."""
        self.initUnVisitUrl(url)
        while self.unVisitUrl:
            visitingUrl = self.getUnVisitUrl()
            print(visitingUrl)
            if visitingUrl:
                html = self.getHtml(visitingUrl)
                if html:
                    # Discover and enqueue this page's internal links.
                    self.getUrlFromHtml(html, sitePath)

    def initUnVisitUrl(self, url):
        """Seed the frontier with the root URL."""
        self.unVisitUrl.add(url)

    def addUnVisitUrl(self, url):
        """Enqueue *url* unless it is already queued or already seen."""
        if url not in self.unVisitUrl and url not in self.visitedUrl:
            self.unVisitUrl.add(url)

    def getUnVisitUrl(self):
        """Pop one URL from the frontier; return None when it is empty.

        Fix: the original indexed list(frontier)[0] unconditionally, which
        raised IndexError on an empty frontier.
        """
        if not self.unVisitUrl:
            return None
        url = next(iter(self.unVisitUrl))
        self.unVisitUrl.remove(url)
        return url

    def addVisitedUrl(self, url):
        """Record *url* as seen."""
        self.visitedUrl.append(url)
Python single-threaded crawler code.