Finally wrote a small crawler program, noting it down here for reference. It only implements fetching page content for a given URL.
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

/*
 * Fetch page content from a URL.
 */
public class DownloadPage {

    public String getContentFormUrl(String url, Logger log) {
        String content = null;
        /* Retry up to three times. */
        for (int i = 0; i < 3; i++) {
            /* Instantiate a fresh HttpClient per attempt: the connection
               manager is shut down in finally, so the client cannot be
               reused on the next iteration. */
            HttpClient client = new DefaultHttpClient();
            HttpGet getHttp = new HttpGet(url);
            try {
                /* Socket (read) timeout: 3 seconds. */
                client.getParams().setIntParameter(HttpConnectionParams.SO_TIMEOUT, 3000);
                /* Connection timeout: 3 seconds. */
                client.getParams().setIntParameter(HttpConnectionParams.CONNECTION_TIMEOUT, 3000);
                /* Execute the request and obtain the response entity. */
                HttpResponse response = client.execute(getHttp);
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    /* Convert the entity to text. */
                    content = EntityUtils.toString(entity);
                    log.info("Fetched " + url);
                    return content;
                }
            } catch (Exception e) {
                log.error("Exception while fetching " + url + ": " + e.toString());
            } finally {
                /* Release the connection resources for this attempt. */
                client.getConnectionManager().shutdown();
            }
        }
        return content;
    }
}
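For reference, a minimal sketch of how the class might be called. The CrawlerDemo class name and the example URL are assumptions for illustration, not part of the original post; the log4j 1.x BasicConfigurator is used only to get console output without a config file.

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;

public class CrawlerDemo {
    public static void main(String[] args) {
        // Minimal log4j setup so log output goes to the console.
        BasicConfigurator.configure();
        Logger log = Logger.getLogger(DownloadPage.class);

        DownloadPage downloader = new DownloadPage();
        // Hypothetical URL, for illustration only.
        String content = downloader.getContentFormUrl("http://example.com", log);
        if (content != null) {
            System.out.println(content);
        } else {
            log.warn("All three attempts failed.");
        }
    }
}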