本範例使用 HttpClient 的最新版本(需另行下載)。
注釋已經寫的比較清楚了,就不再說明了。
爬取到的 HTML 內容,既可以用 Java 的 Regex 截取,也可以用 jsoup(需另行下載)解析;目前使用的是 jsoup,這個用起來非常方便,不用自己辛苦地寫 Regex。
大家注意:這種方式有時候會失敗。如果過於頻繁地用這種方式登入 renren.com,帳號可能會被限制,並且會被要求輸入驗證碼;一旦出現驗證碼,就無法正常登入 renren.com 了。
將方法提取了,這樣便於閱讀。
同時處理了 HttpClient 預設 cookie 策略會報 WARN 警告的問題。
/* * ==================================================================== * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see <http://www.apache.org/>. 
* renren.com * <input type="hidden" name="origURL" value="http://www.renren.com/home" /> * <input type="hidden" name="domain" value="renren.com" /> * <input type="hidden" name="key_id" value="1" /> * <input type="submit" id="login" class="input-submit login-btn" value="登入人人網" tabindex="5"/> * http://s.xnimg.cn/a36853/n/apps/login/login-all.js */package org.apache.http.examples.client;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.util.ArrayList;import java.util.List;import org.apache.http.Header;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.NameValuePair;import org.apache.http.ParseException;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.HttpClient;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.params.ClientPNames;import org.apache.http.cookie.ClientCookie;import org.apache.http.cookie.Cookie;import org.apache.http.cookie.CookieOrigin;import org.apache.http.cookie.CookieSpec;import org.apache.http.cookie.CookieSpecFactory;import org.apache.http.cookie.MalformedCookieException;import org.apache.http.cookie.params.CookieSpecPNames;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;import org.apache.http.impl.cookie.BasicClientCookie;import org.apache.http.impl.cookie.BrowserCompatSpec;import org.apache.http.message.BasicNameValuePair;import org.apache.http.params.HttpParams;import org.apache.http.protocol.HTTP;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import com.sun.xml.internal.ws.transport.http.client.HttpCookie;/** * * Purpose: * * @author: shihuangzhe.com * @since: JDK 1.6 * @date: 2012-4-28 * */public class RrLogin {/** 帳號 
*/private static final String userName = "xxxxxx@yahoo.com.cn";/** 密碼 */private static final String password = "xxxxxxxx";/** 網域 */private static final String domain = "renren.com";/** key_id */private static final String keyID = "1";/** 表單提交url */private static String loginURL = "http://www.renren.com/PLogin.do";/** 登陸成功後,跳轉到我自己的blog日誌,人人預設跳轉路徑為 http://www.renren.com/home */private static final String targetUrl = "http://blog.renren.com/blog/84082953/398292611";/** 表單域常量(跳轉url) */private static final String _ORGI_URL = "origURL";/** 表單域常量(網域) */private static final String _DOMAIN = "domain";/** 表單域常量(key_id) */private static final String _KEY_ID = "key_id";/** 表單域常量(帳號) */private static final String _EMAIL = "email";/** 表單域常量(密碼) */private static final String _PASSWORD = "password";/** ThreadSafeClientConnManager保證多安全執行緒 */private DefaultHttpClient client = new DefaultHttpClient(new ThreadSafeClientConnManager());/** response相應 */private HttpResponse response;/** * Purpose: 登陸renren.com * * @throws Exception * @return: void */private boolean login(String userName, String password) {boolean isLogin = false;HttpPost httpost = new HttpPost(loginURL);// 為請求參數賦值List<NameValuePair> nvps = new ArrayList<NameValuePair>();nvps.add(new BasicNameValuePair(_ORGI_URL, targetUrl));nvps.add(new BasicNameValuePair(_DOMAIN, domain));nvps.add(new BasicNameValuePair(_KEY_ID, keyID));nvps.add(new BasicNameValuePair(_EMAIL, userName));nvps.add(new BasicNameValuePair(_PASSWORD, password));try {httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));// 擷取請求相應response = client.execute(httpost);System.out.println(response.getStatusLine());// 返回302// 設定cookie,renren.com用於身分識別驗證的cookie有兩個,名字分別是p和t.// HttpClientParams.setCookiePolicy(client.getParams(),// CookiePolicy.BROWSER_COMPATIBILITY);// 因為HttpClient 4.0預設cookie策略會報WARN警告,所以手動定製cookie策略CookieSpecFactory csf = new CookieSpecFactory() {public CookieSpec newInstance(HttpParams params) {return new BrowserCompatSpec() {@Overridepublic 
void validate(Cookie cookie, CookieOrigin origin)throws MalformedCookieException {// nothing to do}};}};client.getCookieSpecs().register("easy", csf);client.getParams().setParameter(ClientPNames.COOKIE_POLICY, "easy");isLogin = true;} catch (UnsupportedEncodingException e) {System.err.println("UnsupportedEncodingException!");} catch (ClientProtocolException e) {System.err.println("ClientProtocolException!");} catch (IOException e) {System.err.println("IOException!");} finally {httpost.abort();}return isLogin;}/** * Purpose: 擷取blog內容 * * @param response * @return: String */private void showResult(String userName, String password) {try {if (!login(userName, password)) {System.err.println("登陸失敗!");System.exit(0);}/* * 注意,因為renren.com登陸成功後,需要再次經過 * http://www.renren.com/callback.do?t=da278e2526f9b2387ea22e57578a85d93 * & * origURL=http%3A%2F%2Fblog.renren.com%2Fblog%2F84082953%2F398292611 * &needNotify=false 這種方式跳轉,所以需要再次處理髮一次請求 */// 列印所有相應頭Header[] headers = response.getAllHeaders();for (int i = 0; i < headers.length; i++) {Header header = headers[i];System.out.println(header.getName() + ": " + header.getValue());}// 擷取真實跳轉路徑Header locationHeader = response.getFirstHeader("Location");// 頻繁登陸,failCode = 512,就會要求輸入驗證碼登陸了HttpGet httpget = new HttpGet(locationHeader.getValue());HttpResponse response2 = client.execute(httpget);System.out.println(response2.getStatusLine()); // HTTP/1.1 200 OK// 擷取EntityHttpEntity entity = response2.getEntity();// 解析html,拿出blogString[] context = htmlToPlainText(EntityUtils.toString(entity));System.out.println("---------解析後的內容----------- ");System.out.print("Title: ");System.out.println(context[0]);System.out.print("Context: ");System.out.println(context[1]);} catch (ClientProtocolException e) {System.err.println("ClientProtocolException!");} catch (ParseException e) {System.err.println("ParseException!");} catch (IOException e) {System.err.println("IOException!");} finally {// When HttpClient instance is no longer needed,// shut down the 
connection manager to ensure// immediate deallocation of all system resourcesclient.getConnectionManager().shutdown();}}// /**// * Purpose: 正則提取blog內容// * @param orgTest// * @return: String// */// private String printBlog(String orgTest) {// // 正則匹配規則// // String regexp = "<div\\s*id=\"blogContent\"\\s*[^>]*>(.+?)</div>";// // String regexp =// "(<div id=\"blogContent\" class=\"text-article\")(.+?)( </div>)";// Pattern pattern =// Pattern.compile("<div\\s*id=\"blogContent\"\\s*[^>]*>(.+?)</div>");// Matcher m = pattern.matcher(orgTest);// if (!m.find()) {// return null;// }// return m.group(0);// }/** * Purpose: 使用jsoup解析Html * * @param html * @return: String[] */private static String[] htmlToPlainText(String html) {String[] content = new String[] { "", "" };Document doc = Jsoup.parse(html);// 提取blog標題Elements titles = doc.select("h3.title-article>strong");for (Element oneSelect : titles)content[0] += oneSelect.text();// 提取blog內容Elements contents = doc.select("div#blogContent");for (Element oneSelect : contents)content[1] += oneSelect.text();return content;}/** * Purpose: 測試 * * @param args * @return: void */public static void main(String[] args) {RrLogin renRen = new RrLogin();renRen.showResult(userName, password);}}