Java Robots Protocol Detection Tool

This tool reads one or more URLs, fetches robots.txt from each host, and reports whether the Robots Exclusion Protocol allows each URL to be crawled. Results are written to result.txt, one "url: true" or "url: false" line per input URL.

/*
 * @(#)SearchCrawler.java, April 12, 2016. Copyright Youdao, Inc. All rights
 * reserved. Youdao proprietary/confidential. Use is subject to license terms.
 */
package testzk;

import java.io.*;
import java.net.*;
import java.util.*;

/**
 * @author zhoukang
 */
public class SearchCrawler extends Thread {

    // Cache of Disallow paths per host, so robots.txt is fetched only once per host.
    private HashMap<String, ArrayList<String>> disallowListCache =
            new HashMap<String, ArrayList<String>>();
    private List<String> urlList;
    private static File resultFile = new File("result.txt");
    private static BufferedWriter writer;

    static {
        try {
            if (!resultFile.exists()) {
                resultFile.createNewFile();
            }
            writer = new BufferedWriter(new FileWriter(resultFile));
        } catch (Exception e) {
            // TODO: handle exception
        }
    }

    // If file is true, str names a file containing one URL per line;
    // otherwise str is a single URL.
    public SearchCrawler(String str, boolean file) throws IOException {
        urlList = new ArrayList<String>();
        if (file) {
            BufferedReader reader = null;
            try {
                reader = new BufferedReader(new FileReader(new File(str)));
                String line;
                while ((line = reader.readLine()) != null) {
                    urlList.add(line);
                }
            } catch (Exception e) {
                // TODO: handle exception
            } finally {
                if (reader != null) {
                    reader.close();
                }
            }
        } else {
            urlList.add(str);
        }
    }

    public BufferedWriter getBufferedWriter() {
        return writer;
    }

    public void run() {
        checkUrl(urlList);
    }

    private void checkUrl(List<String> urls) {
        Iterator<String> urlIter = urls.iterator();
        while (urlIter.hasNext()) {
            String url = urlIter.next();
            if (url == null || url.equals("")) {
                continue;
            }
            url = removeWwwFromUrl(url);
            URL verifiedUrl = verifyUrl(url);
            System.out.println(url);
            if (verifiedUrl == null) {
                // Skip anything that is not a valid http:// URL.
                continue;
            }
            try {
                if (isRobotAllowed(verifiedUrl)) {
                    writer.write(url + ": true");
                } else {
                    writer.write(url + ": false");
                }
                writer.newLine();
                writer.flush();
            } catch (Exception e) {
                // TODO: handle exception
            }
        }
    }

    private URL verifyUrl(String url) {
        if (!url.toLowerCase().startsWith("http://")) {
            return null;
        }
        try {
            return new URL(url);
        } catch (Exception e) {
            return null;
        }
    }

    // Fetches and parses robots.txt for the URL's host (with per-host caching),
    // then tests the URL's path against every Disallow entry.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();
        ArrayList<String> disallowList = disallowListCache.get(host);
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) {
                        String disallowPath = line.substring("Disallow:".length());
                        // Strip trailing comments.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }
                        disallowPath = disallowPath.trim();
                        // An empty Disallow value means "allow everything"; skip it,
                        // otherwise startsWith("") would block every path.
                        if (disallowPath.length() > 0) {
                            disallowList.add(disallowPath);
                        }
                    }
                }
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                // If robots.txt cannot be fetched, assume crawling is allowed.
                return true;
            }
        }
        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
            if (file.startsWith(disallowList.get(i))) {
                return false;
            }
        }
        return true;
    }

    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) + url.substring(index + 7);
        }
        return url;
    }

    // Close the shared writer when the JVM exits.
    private static void addShutdownHook(final SearchCrawler searchCrawler) {
        Runtime.getRuntime().addShutdownHook(new Thread() {
            public void run() {
                BufferedWriter writer = searchCrawler.getBufferedWriter();
                try {
                    writer.close();
                } catch (Exception e) {
                    System.out.println("Add error");
                }
            }
        });
    }

    public static void main(String[] args) throws InterruptedException, IOException {
        if (args.length != 1 && args.length != 2) {
            System.out.println("Usage-1: java SearchCrawler url");
            System.out.println("Usage-2: java SearchCrawler -f filename");
            return;
        }
        SearchCrawler crawler;
        if (args.length == 1) {
            crawler = new SearchCrawler(args[0], false);
        } else {
            crawler = new SearchCrawler(args[1], true);
        }
        addShutdownHook(crawler);
        crawler.setDaemon(true);
        crawler.start();
        crawler.join();
    }
}
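To try the tool, compile it and pass either a single URL, or -f followed by a file of URLs (the directory layout and URL below are placeholders):

    javac testzk/SearchCrawler.java
    java testzk.SearchCrawler http://example.com/page

Each checked URL is written to result.txt as a "url: true" or "url: false" line. The usage message prints the bare class name, but because the class is declared in package testzk, the fully qualified name is needed on the command line.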