A few days ago I published some source code for saving files. Today I will show how to fetch the source code of a web page. Anyone who has learned both Java and C knows that this is much simpler in Java than in C: much of the work is already encapsulated by the Java library, so you only need to call a few methods of the URL class to obtain the resource you want. In C, by contrast, you have to construct and send the request packets yourself, and you have to understand the format of the HTTP request and response messages, which is quite troublesome. The following is the (simplified) class my project uses to fetch page source code.
package com.wyp.html;

/**
 * @author w397090770
 * Create Date: 2012-7-17
 * Email: wyphao.2007@163.com
 *
 * All rights reserved. Copying is not pursued, but these notes must be kept
 * when modifying this program. Thank you.
 * For learning and communication only.
 */

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;

// import org.apache.log4j.Logger;
// import org.apache.log4j.PropertyConfigurator;

import com.wyp.utils.BlogReturnStatus;
import com.wyp.utils.Pair;

/**
 * Common web page crawling class: downloads either the text of a page or an
 * image file over HTTP.
 *
 * @author w397090770
 */
public class SpiderHTML {

    /** User name; used as the name of the per-user folder under {@link #pathText}. */
    public static String userName = null;

    /** Base directory under which downloaded images are saved. */
    public static String pathText = null;

    public SpiderHTML() {
    }

    /**
     * Fetches a URL and returns its content together with a status.
     *
     * <p>If {@code urlString} has no {@code http://} or {@code https://}
     * prefix, {@code http://} is prepended. {@code timeout} is applied to both
     * connecting and reading. The connection and all streams are released
     * before this method returns, whatever the outcome.
     *
     * @param urlString url to be fetched
     * @param charset   character encoding used to decode the page (type 0 only)
     * @param timeout   connect and read timeout, in milliseconds
     * @param type      what to fetch: 0 = web page text, 1 = image
     * @return {@code null} if {@code urlString} is null/empty or {@code type}
     *         is unsupported; otherwise a pair of (content, status). For type 0
     *         the content is the page text, for type 1 it is the path the image
     *         is saved under. The content is {@code null} when the status is
     *         not {@link BlogReturnStatus#OK}: {@link BlogReturnStatus#TIME_OUT}
     *         when connecting or reading timed out,
     *         {@link BlogReturnStatus#FAILURE} when the connection failed or
     *         the server did not answer 200.
     * @throws IOException if the URL is malformed, the charset is unsupported,
     *                     or reading the page body fails for a reason other
     *                     than a timeout
     */
    public static Pair<String, BlogReturnStatus> GetWebContent(String urlString,
            final String charset, int timeout, int type) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }

        URL url = new URL(urlString);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        InputStream input = null;
        BufferedReader reader = null;
        try {
            // Only accept the text/html type.
            conn.setRequestProperty("Accept", "text/html");
            conn.setConnectTimeout(timeout);
            // Without a read timeout a server that stops sending blocks forever.
            conn.setReadTimeout(timeout);

            int responseCode;
            try {
                responseCode = conn.getResponseCode();
            } catch (SocketTimeoutException e) {
                return result(null, BlogReturnStatus.TIME_OUT);
            } catch (IOException e) {
                return result(null, BlogReturnStatus.FAILURE);
            }
            if (responseCode != HttpURLConnection.HTTP_OK) {
                // A non-200 answer is a failure, not a timeout.
                return result(null, BlogReturnStatus.FAILURE);
            }

            StringBuilder sb = new StringBuilder();
            switch (type) {
            case 0:
                input = conn.getInputStream();
                // The charset reader is only needed for text, not for images.
                reader = new BufferedReader(new InputStreamReader(input, charset));
                String lineSeparator = System.getProperty("line.separator");
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append(lineSeparator);
                }
                break;
            case 1:
                input = conn.getInputStream();
                sb.append(processImg(input, urlString));
                break;
            default:
                System.err.println("Unsupport File Type!");
                return null;
            }
            return result(sb.toString(), BlogReturnStatus.OK);
        } catch (SocketTimeoutException e) {
            // The read timeout expired while the page body was being read.
            return result(null, BlogReturnStatus.TIME_OUT);
        } finally {
            close(reader);
            close(input);
            conn.disconnect();
        }
    }

    /** Builds the (content, status) pair returned by {@link #GetWebContent}. */
    private static Pair<String, BlogReturnStatus> result(String content,
            BlogReturnStatus status) {
        return new Pair<String, BlogReturnStatus>(content, status);
    }

    /** Closes {@code resource} if it is not null, reporting (not throwing) a close failure. */
    private static void close(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Saves the image read from {@code is} into the user's img folder,
     * {@code pathText/userName/img/}, creating the folder if necessary. The
     * file name is the part of {@code urlString} after its last {@code '/'}.
     * If a file with that name already exists nothing is downloaded.
     *
     * <p>An I/O error while saving is reported on stderr and the partially
     * written file is deleted, so that a later call downloads it again
     * instead of treating the truncated file as complete.
     *
     * @param is        stream to read the image bytes from; closed by this
     *                  method when a download is attempted
     * @param urlString url the image was requested from
     * @return path of the image file (returned even if saving failed)
     */
    private static String processImg(InputStream is, String urlString) {
        String dirs = pathText + File.separator + userName + File.separator
                + "img" + File.separator;
        File dir = new File(dirs);
        // The directory does not exist: create it.
        if (!dir.exists()) {
            dir.mkdirs();
        }

        // Image name and format, e.g. "logo.png".
        String imgNameAndType = urlString.substring(urlString.lastIndexOf("/") + 1);
        String imgPath = dirs + imgNameAndType;
        File file = new File(imgPath);
        if (file.exists()) {
            return imgPath;
        }

        OutputStream os = null;
        boolean saved = false;
        try {
            os = new FileOutputStream(file);
            byte[] buffer = new byte[8192];
            int count;
            while ((count = is.read(buffer)) != -1) {
                os.write(buffer, 0, count);
            }
            saved = true;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Each stream is closed independently; os may still be null here.
            close(os);
            close(is);
            if (!saved) {
                file.delete();
            }
        }
        return imgPath;
    }

    /**
     * Class test function.
     *
     * @param args unused
     * @throws IOException if fetching the page fails
     */
    public static void main(String[] args) throws IOException {
        Pair<String, BlogReturnStatus> pair = GetWebContent(
                "http://blog.csdn.net/w397090770", "UTF-8", 5000, 0);
        System.out.println(pair.getSecond());
    }
}
This class is fairly simple. It can be used to fetch both the source code of a web page and images; note the difference between how an image and a page's source code are read. The two helper classes it uses are listed below.
Package com. wyp. utils;/*** @ author w397090770 * Create Data 2012-7-7 * Email: wyphao.2007@163.com * All rights reserved, review not to investigate, but be sure to add these notes when modifying this program. Thank you * for learning and communication only */public class Pair <T1, T2> {private T1 first; private T2 second;/***/public Pair () {// TODO Auto-generated constructor stubfirst = null; second = null ;} /*** @ param first * first part * @ param second * second part */public Pair (T1 first, T2 second) {this. first = first; this. second = second;} public Pair (Pair <T1, T2> pair) {this. first = pair. getFirst (); this. second = pair. getSecond ();} public Pair <T1, T2> make_pair (T1 first, T2 second) {Pair <T1, T2> pair = new Pair <T1, T2> (first, second); return pair;}/*** @ return the first */public T1 getFirst () {return first ;} /*** @ param first * the first to set */public void setFirst (T1 first) {this. first = first;}/*** @ return the second */public T2 getSecond () {return second ;} /*** @ param second * the second to set */public void setSecond (T2 second) {this. second = second ;}}
Package com. wyp. utils;/*** @ author w397090770 * Create Data 2012-7-6 * Email: wyphao.2007@163.com ** All rights reserved, review not to investigate, but be sure to add these notes when modifying this program. Thank you * only for learning and communicating ** this class is used to mark the status of the returned blog address */public enum BlogReturnStatus {// The webpage is successfully obtained OK, // get the webpage Source Code exceeds the set time TIME_OUT, // get the webpage exception FAILURE}