Package COM. th. spider. test; import Java. io. bufferedoutputstream; import Java. io. fileoutputstream; import Org. apache. commons. logging. log; import Org. apache. commons. logging. logfactory; import Org. apache. HTTP. httpentity; import Org. apache. HTTP. httpresponse; import Org. apache. HTTP. httpstatus; import Org. apache. HTTP. client. httpclient; import Org. apache. HTTP. client. methods. httpget; import Org. apache. HTTP. impl. clie NT. defaulthttpclient; import Org. apache. HTTP. params. coreconnectionpnames; import Org. apache. HTTP. util. entityutils; import Org. jsoup. connection; import Org. jsoup. jsoup; import Org. jsoup. nodes. document; import Org. jsoup. nodes. element; import Org. jsoup. select. elements; public class exmaple3 {Private Static final log = logfactory. getlog (exmaple3.class);/*** directory for storing captured images */Private Static final string pic_dir = "/Home/Li/PIC";/*** link timeout */Private Static final int time_out = 5000; static void go3 (string URL) throws exception {connection conn = jsoup. connect (URL); document DOC = Conn. get (); elements links = Doc. select ("Div. piclist IMG [SRC] "); For (INT I = 0; I <links. size (); I ++) {element = links. get (I); final string imgurl = element. ATTR ("src"); log.info (imgurl); thread. sleep (500); New thread (New runna Ble () {public void run () {try {save (imgurl);} catch (exception e) {// todo auto-generated Catch Block E. printstacktrace ();}}}). start () ;}} static void go2 (string URL) throws exception {connection conn = jsoup. connect (URL); document DOC = Conn. get (); elements links = Doc. select ("Div. cc a [href] "); For (INT I = 0; I <links. size (); I ++) {element = links. get (I); final string dirurl = "http :// Www.3lian.com "+ element. ATTR ("href"); log.info (dirurl); thread. sleep (500); New thread (New runnable () {public void run () {try {connection conn = jsoup. connect (dirurl); document DOC = Conn. get (); elements images = Doc. select ("Div. 
mb_jjnr IMG [SRC] "); For (Int J = 0; j <images. size (); j ++) {element IMG = images. get (j); string imgurl = IMG. ATTR ("src"); log.info (imgurl); Save (imgurl) ;}} catch (exception E) {E. printstacktrace ();}}}). start () ;}}/*** process Post URL * @ Param URL * @ throws exception */static void go (string URL) throws exception {// connection conn = jsoup created by jsop. connect (URL); // The request returns the entire document object document DOC = Conn. post (); // select all class = Zoom's IMG Tag object elements IMGs = Doc. select ("IMG [class = zoom]"); // loop each IMG label for (INT I = 0; I // Doc. select ("IMG [class = zoom]")/*** Save image * @ Param URL * @ Param I * @ throws exception */static void save (string URL) throws exception {string filename = URL. substring (URL. lastindexof ("/"); string filepath = pic_dir + "/" + filename; bufferedoutputstream out = NULL; byte [] bit = getbyte (URL); If (bit. length> 0) {try {out = new bufferedoutputstream (New fileoutputstream (filepath); out. write (BIT); out. flush (); log.info ("Create File success! ["+ Filepath +"] ");} finally {If (OUT! = NULL) out. close () ;}}/*** get the image byte stream * @ Param URI * @ return * @ throws exception */static byte [] getbyte (string URI) throws exception {httpclient client = new defaulthttpclient (); client. getparams (). setparameter (coreconnectionpnames. connection_timeout, time_out); httpget get = new httpget (URI); get. getparams (). setparameter (coreconnectionpnames. connection_timeout, time_out); try {httpresponse resonse = Cl Ient.exe cute (get); If (resonse. getstatusline (). getstatuscode () = httpstatus. SC _ OK) {httpentity entity = resonse. getentity (); If (entity! = NULL) {return entityutils. tobytearray (entity) ;}} catch (exception e) {e. printstacktrace ();} finally {client. getconnectionmanager (). 
shutdown ();} return New byte [0];} public static void main (string [] ARGs) throws exception {// start capturing images go2 ("http://www.3lian.com/gif/more/03/0301.html "); // go3 ("http://www.ivsky.com/tupian/nvxing_gouwu_qingjing_v6969 ");}}
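Required JAR packages:
httpclient-4.0.1.jar and jsoup-1.5.2.jar (the code also imports HttpCore and Commons Logging classes, so httpcore and commons-logging must be on the classpath as well)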
The methods go, go2 and go3 each capture images from a differently structured page; compare them to see how the approach works.