The first step is to implement the LinkQueue, which stores the URLs to be crawled and filters out the ones that have already been seen.
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class LinkQueue {
    // collection of URLs that have already been visited
    private static Set<String> visitedUrl = Collections.synchronizedSet(new HashSet<String>());
    // queue of URLs that have not been visited yet
    private static List<String> unVisitedUrl = Collections.synchronizedList(new ArrayList<String>());

    // dequeue an unvisited URL and mark it as visited
    public static String unVisitedUrlDequeue() {
        if (unVisitedUrl.size() > 0) {
            String url = unVisitedUrl.remove(0);
            visitedUrl.add(url);
            return url;
        }
        return null;
    }

    // enqueue a new URL, validating that each URL is only added once
    public static void addUnvisitedUrl(String url) {
        if (url != null && !url.trim().equals("") && !visitedUrl.contains(url)
                && !unVisitedUrl.contains(url))
            unVisitedUrl.add(url);
    }

    // check whether the unvisited URL queue is empty
    public static boolean unVisitedUrlsEmpty() {
        return unVisitedUrl.isEmpty();
    }
}
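To see how the queue behaves, here is a minimal, hypothetical sketch (the URLs are placeholders and this class is not part of the crawler itself): addUnvisitedUrl silently drops the duplicate, and unVisitedUrlDequeue returns the remaining URLs in FIFO order while marking them as visited.

public class LinkQueueDemo {
    public static void main(String[] args) {
        LinkQueue.addUnvisitedUrl("http://example.com/page1");
        LinkQueue.addUnvisitedUrl("http://example.com/page2");
        // this duplicate is ignored because page1 is already queued
        LinkQueue.addUnvisitedUrl("http://example.com/page1");
        while (!LinkQueue.unVisitedUrlsEmpty()) {
            // prints page1 and then page2, each exactly once
            System.out.println(LinkQueue.unVisitedUrlDequeue());
        }
    }
}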
The second step is to collect the links on each downloaded page and filter out the new links worth following.
import java.util.HashSet;
import java.util.Set;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Filters HTTP URLs and returns the ones that match the given rule.
 * @author Administrator
 */
public class ParserHttpUrl {

    // Collects the links on a page; the filter decides which links are kept.
    public static Set<String> extracLinks(String url, LinkFilter filter) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            // filter for <frame> tags, used to extract the link in the frame's src attribute
            NodeFilter frameFilter = new NodeFilter() {
                public boolean accept(Node node) {
                    if (node.getText().startsWith("frame src=")) {
                        return true;
                    } else {
                        return false;
                    }
                }
            };
            // OrFilter combines the <a> tag filter and the <frame> tag filter
            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            // get all nodes that pass the combined filter
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tag = list.elementAt(i);
                if (tag instanceof LinkTag) { // <a> tag
                    LinkTag link = (LinkTag) tag;
                    String linkUrl = link.getLink();
                    if (filter.accept(linkUrl))
                        links.add(linkUrl);
                } else { // <frame> tag
                    // extract the src attribute of the frame, e.g. <frame src="test.html"/>
                    String frame = tag.getText();
                    int start = frame.indexOf("src=");
                    frame = frame.substring(start);
                    int end = frame.indexOf(" ");
                    if (end == -1)
                        end = frame.indexOf(">");
                    String frameUrl = frame.substring(5, end - 1);
                    if (filter.accept(frameUrl))
                        links.add(frameUrl);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return links;
    }
}
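As a quick, hypothetical usage sketch (the seed URL is only a placeholder), the extractor can be exercised on its own with a filter that accepts every link, which makes it easy to inspect what htmlparser actually finds on a page; LinkFilter is the small interface defined further below.

public class ParserHttpUrlDemo {
    public static void main(String[] args) {
        // accept every link so that everything the parser extracts gets printed
        LinkFilter acceptAll = new LinkFilter() {
            public boolean accept(String url) {
                return true;
            }
        };
        for (String link : ParserHttpUrl.extracLinks("http://www.douban.com", acceptAll)) {
            System.out.println(link);
        }
    }
}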
The third step is to implement the picture download function.
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*** Java crawler that downloads the pictures on a page. @author swinglife */
public class DownloadPic {
    // character encoding used when reading the page
    private static final String ECODING = "UTF-8";
    // regular expression that matches <img> tags
    private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
    // regular expression that extracts the src path from an <img> tag
    private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";

    public static void downloadPic(String url) {
        // get the HTML text of the page
        String html = null;
        try {
            html = DownloadPic.getHtml(url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (null != html && !"".equals(html)) {
            // collect the <img> tags, then their src addresses, then download them
            List<String> imgUrl = DownloadPic.getImageUrl(html);
            List<String> imgSrc = DownloadPic.getImageSrc(imgUrl);
            DownloadPic.download(imgSrc);
        }
    }

    /*** Get the HTML content of a page. */
    private static String getHtml(String url) throws Exception {
        URL uri = new URL(url);
        URLConnection connection = uri.openConnection();
        InputStream in = connection.getInputStream();
        byte[] buf = new byte[1024];
        int length = 0;
        StringBuffer sb = new StringBuffer();
        while ((length = in.read(buf, 0, buf.length)) > 0) {
            sb.append(new String(buf, 0, length, ECODING));
        }
        in.close();
        return sb.toString();
    }

    /*** Get the <img> tags contained in the HTML. */
    private static List<String> getImageUrl(String html) {
        Matcher matcher = Pattern.compile(IMGURL_REG).matcher(html);
        List<String> listImgUrl = new ArrayList<String>();
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /*** Get the src addresses out of the <img> tags. */
    private static List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : listImageUrl) {
            Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);
            while (matcher.find()) {
                // drop the trailing quote captured by the pattern
                listImgSrc.add(matcher.group().substring(0, matcher.group().length() - 1));
            }
        }
        return listImgSrc;
    }

    /*** Download the pictures into the working directory. */
    private static void download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            try {
                String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
                URL uri = new URL(url);
                InputStream in = uri.openStream();
                FileOutputStream fo = new FileOutputStream(new File(imageName));
                byte[] buf = new byte[1024];
                int length = 0;
                while ((length = in.read(buf, 0, buf.length)) != -1) {
                    fo.write(buf, 0, length);
                }
                in.close();
                fo.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
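The two regular expressions are the least obvious part, so here is a small, self-contained sketch (the HTML snippet and image URL are made up for illustration) showing what they do: the first pattern pulls out a whole <img> tag, the second pulls the http address out of that tag, and the trailing quote is stripped off exactly as getImageSrc does.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ImgRegexDemo {
    public static void main(String[] args) {
        // made-up page fragment containing a single <img> tag
        String html = "<p>hello</p><img src=\"http://example.com/pics/cat.jpg\" alt=\"cat\">";
        // first pass: grab the whole <img ...> tag
        Matcher tag = Pattern.compile("<img.*src=(.*?)[^>]*?>").matcher(html);
        while (tag.find()) {
            // second pass: pull the http... address out of the tag
            Matcher src = Pattern.compile("http:\"?(.*?)(\"|>|\\s+)").matcher(tag.group());
            while (src.find()) {
                // drop the trailing quote, just like getImageSrc does
                System.out.println(src.group().substring(0, src.group().length() - 1));
            }
        }
    }
}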
Next, define the filtering interface; this is the LinkFilter type that extracLinks takes as a parameter:

public interface LinkFilter {
    public boolean accept(String url);
}
The fourth step is to implement the filtering rules and the crawl loop:
import java.util.Set;

public class Crawler {
    /** The crawl process, starting from the given seed URL. */
    public void crawling(String url) {
        // Define the filter. The rule changes with the site being crawled; here it keeps
        // only links inside the Douban group I was crawling.
        LinkFilter filter = new LinkFilter() {
            public boolean accept(String url) {
                return url.indexOf("douban.com/group/topic") != -1
                        || url.indexOf("douban.com/group/haixiuzu/discussion?start") != -1;
            }
        };
        // initialize the URL queue with the seed, then loop while there are links to crawl
        LinkQueue.addUnvisitedUrl(url);
        while (!LinkQueue.unVisitedUrlsEmpty()) {
            // take the URL at the head of the queue
            String visitUrl = LinkQueue.unVisitedUrlDequeue();
            if (visitUrl == null) continue;
            // download the pictures on this page
            DownloadPic.downloadPic(visitUrl);
            // extract the links on the page and enqueue the new, unvisited ones
            Set<String> links = ParserHttpUrl.extracLinks(visitUrl, filter);
            for (String link : links) {
                LinkQueue.addUnvisitedUrl(link);
            }
        }
    }

    // main entry point
    public static void main(String[] args) {
        Crawler crawler = new Crawler();
        crawler.crawling("http://www.douban.com/group/haixiuzu/discussion?start=0");
    }
}