The following is the source code:
// NOTE: the original listing declared this class in package "webspider". Add
// "package webspider;" as the first statement when the file is stored under a
// webspider/ source directory.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web crawler demo: downloads a single page and prints the value of
 * every {@code href="..."} attribute found in it.
 *
 * @author VCANCCC
 */
public class WebSpiderTest {

    /**
     * Matches an {@code href="..."} attribute; group 1 is the (reluctantly
     * matched) attribute value. The match is case-sensitive.
     *
     * <p>An alternative pattern, {@code <a[\s\S]+?</a>}, captures whole anchor
     * elements instead of just the link target.
     */
    private static final Pattern HREF_PATTERN = Pattern.compile("href=\"(.+?)\"");

    /**
     * Reads the content behind a URL and returns it as one string.
     *
     * <p>The content is read line by line and the lines are concatenated
     * without their line terminators. Failures are reported on standard error
     * rather than thrown: a malformed URL yields an empty string, and an I/O
     * error yields whatever was read before the error occurred.
     *
     * @param urlStr  the URL to read, in any form accepted by {@link URL#URL(String)}
     * @param charset the name of the charset used to decode the response bytes
     * @return the decoded content with line terminators removed; possibly empty
     * @throws java.nio.charset.IllegalCharsetNameException if {@code charset} is not a legal name
     * @throws java.nio.charset.UnsupportedCharsetException if {@code charset} is not supported
     */
    public static String getUrlContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(urlStr);
            // Declare the raw stream as its own resource so that it is closed even
            // when resolving the charset (evaluated after the stream is opened) fails.
            try (InputStream in = url.openStream();
                 BufferedReader reader = new BufferedReader(
                         new InputStreamReader(in, Charset.forName(charset)))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (MalformedURLException e) {
            System.err.println("Invalid URL '" + urlStr + "': " + e);
        } catch (IOException e) {
            System.err.println("Failed to read '" + urlStr + "': " + e);
        }
        return sb.toString();
    }

    /**
     * Fetches the Taobao home page and prints each {@code href} value on its own line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String destStr = getUrlContent("https://www.taobao.com", "UTF-8");
        Matcher m = HREF_PATTERN.matcher(destStr);
        while (m.find()) {
            System.out.println(m.group(1));
        }
    }
}
A simple Java web crawler implementation