Source Code for Crawling Web Pages with HttpClient

Source: Internet
Author: User
package util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.params.HttpMethodParams;

/ **
* @author Six Flavors
* Date: December 18, 2009
*
* TODO
* Auxiliary class of HttpClient
* /
public class Httpclienthelper
{

/ **
* HttpClient connection Timeout, read data time-out setting (unit: milliseconds)
* /
public static final Interpretation httpclient_connection_timeout = 30000;
public static final Interpretation httpclient_so_timeout = 120000;
public static final Interpretation httpmethod_so_timeout = 5000;

Let the ConnectionManager manage httpclientconnection when the connection is closed
private static Boolean alwaysclose = FALSE;
private static string Defaultencode = "UTF-8";

private static final DateFormat Date_format = new SimpleDateFormat ("Yyyy-mm-dd HH:MM:SS");

/ **
* Get the HttpClient connection and set the relevant parameters
*
* @return
* /
Gethttpclient of public static httpclient ()
{
HttpClient client = new HttpClient (new Simplehttpconnectionmanager (Alwaysclose));
Httpconnectionmanagerparams managerparams = Client.gethttpconnectionmanager () getparams () method.
Setting the connection time-out (in milliseconds)
Managerparams.setconnectiontimeout (httpclient_connection_timeout);
Set read data time-out (in milliseconds)
Managerparams.setsotimeout (httpclient_so_timeout);
return to the client;
}

/ **
* Get the HttpClient connection and set the relevant parameters
*
* @ parameter Logonsite
* @ parameter Logonport
* @ Parameter Protocol
* @return
* /
public static HttpClient gethttpclient (last string logonsite, Final interpretation Logonport, last string protocol)
{
HttpClient client = new HttpClient (new Simplehttpconnectionmanager (Alwaysclose));
Client.gethostconfiguration () Sethost (logonsite,logonport, protocol).
Httpconnectionmanagerparams managerparams = Client.gethttpconnectionmanager () getparams () method.
Setting the connection time-out (in milliseconds)
Managerparams.setconnectiontimeout (httpclient_connection_timeout);
Set read data time-out (in milliseconds)
Managerparams.setsotimeout (httpclient_so_timeout);
return to the client;
}

private static List < title > getheaders (Map <string, string > Header)
{
List < title > = header of ArrayList new < title > ();
Boolean includeuseragent = FALSE;
if (empty = header && false = = Header.isempty ()! )
{
Set < Enter < String, string >> = EntrySet header.entryset ();
For (enter <string, String > Item: EntrySet)
{
if (false = = Includeuseragent
&& "User Agent". Equals (Entry.getkey ()))
{
Includeuseragent = TRUE;
}
Headers.add (New Header (Entry.getkey (), Entry.getvalue ()));
}
}

if (false = = Includeuseragent)
{
Headers.add (new title (
"User Agent",
"Mozilla's/4.0 (compatible; MSIE 7.0; Windows NT 5.1; GTB5;. NET CLR 1.1.4322;. NET CLR 2.0 0.50727; Alexa tool strip; MAXTHON 2.0));
}
return head;
}

private static Namevaluepair [] getpairs (Map <string, Strings > PostData)
{
if (empty = = | | PostData Postdata.isempty ())
{
returns null;
}

Set < input < string, string >> = EntrySet postdata.entryset ();
INT datalength = Entryset.size ();
Namevaluepair [] = to the new Namevaluepair [Datalength]
INT I = 0;
For (enter <string, String > Item: EntrySet)
{
double [i + +] = new Namevaluepair (Entry.getkey (), Entry.getvalue ());
}
return to;
}

/ **
* Request Page Content information
*
* HttpClient of the @ parameter
* @ parameter Requrl
* Parameter title
* @ parameter PostData
* Parameter Code
* @return
* /
public static string DoRequest (HttpClient of HttpClient, String Requrl,
Map < String, string> header, map <string, String > PostData, String encoding)
{
string htmlcontent = NULL;
if (empty = = HttpClient)
{
return to Htmlcontent;
}

Request Encoding Settings
encoding = (Empty = = Encoding Defaultencode: encoding);

Header Request Information
List < title > = Head getheaders (head);

System.out.println ("[" + Date_format.format (new DATE ()) + "]-dorequest-" + Requrl);

Mode of delivery
, if (null = postdata! )
{
The postmethod of Postmethod = new Encodepostmethod (requrl, encoding);
For (Head Tempheader: Head)
{
Postmethod.setrequestheader (Tempheader);
}

Post-parameter settings
Namevaluepair [] = PARAMS getpairs (postdata);
if (null = parameter!) )
{
Postmethod.setrequestbody (PARAMS);
}

Extract Web content
Htmlcontent = Executemethod (httpclient, posterior method, encoding, Getwebsite (Requrl));
}
Other
{
GetMethod GetMethod = new Implementation GetMethod (Requrl);
For (Head Tempheader: Head)
{
Getmethod.setrequestheader (Tempheader);
}

Extract Web content
Htmlcontent = Executemethod (httpclient, GetMethod, encoded, NULL);
}
return to Htmlcontent;
}

private static string Getwebsite (String requrl)
{
String Web site = NULL;
if (empty = = Requrl | | Requrl.isempty ())
{
return to the website;
}

string prefix = "http://";
if (requrl.startswith (prefix))
{
int index = requrl.substring (Prefix.length ()) indexof ("/") + prefix.length ();
Website = requrl.substring (0, index);
}
return to the website;
}

/ **
* Get Web content by listing HttpMethod
*
* HttpClient of the @ parameter
* @ parameter Requestmethod
* Parameter Code
* Parameters of the website
* @return
* /
private static string Executemethod (HttpClient HttpClient, enumeration HttpMethod Requestmethod, encoded string, string web)
{
string responsecontent = NULL;
if (empty = = HttpClient)
{
return to Responsecontent;
}

Determine whether to request encrypted data
The Boolean dataencrypt = FALSE;
Head acceptencoding = Requestmethod.getrequestheader ("Accept Code");
if (! NULL = acceptencoding
。 && Acceptencoding.getvalue () included ("gzip"))
{
Dataencrypt = TRUE;
}

The InputStream responsestream = NULL;
Try
{
int status = Httpclient.executemethod (Requestmethod);
if (Httpstatus.sc_ok = = status)
{
Responsestream = Requestmethod.getresponsebodyasstream ();
Responsecontent = Getcontentbystream (Dataencrypt new Gzipinputstream (responsestream): Responsestream, coded);
Responsestream.close ();
}
When the return code is 301302303307, it means that the page has been redirected, and the URL of the location is re-requested, which is important when some logins are authorized to fetch cookies.
Otherwise, if (httpstatus.sc_moved_permanently = = State
|| httpstatus.sc_moved_temporarily = = Status
|| Httpstatus.sc_see_other = = Status
|| Httpstatus.sc_temporary_redirect = = status)
{
Read the new URL address
Header hoisting = Requestmethod.getresponseheader ("position");
if (! Header = NULL)
{
The RedirectURL of the string = Header.getvalue ();
if (0 = the redirecturl!
&& false = = Redirecturl.isempty ())
{
Responsecontent = invalid;
if (empty = = of RedirectURL | | Redirecturl.isempty ())
{
RedirectURL = "/";
}

if (false = = Redirecturl.startswith ("http//")
! && NULL = website)
{
if (Website.startswith ("/"))
{
RedirectURL = website + redirecturl;
}
Other
{
RedirectURL = website + "/" + RedirectURL;
}
}

GetMethod REDIRECT = new Implementation GetMethod (RedirectURL);
Head referral = requestmethod.getrequestheader ("Referral");
if (null = referrer!) )
{
Redirect.addrequestheader (referral);
}
The cookie of the head = Requestmethod.getrequestheader ("cookie");
if (empty = biscuit!) )
{
Redirect.addrequestheader (biscuit);
}
Status = Httpclient.executemethod (redirected);
if (Httpstatus.sc_ok = = status)
{
Responsestream = Redirect.getresponsebodyasstream ();
Responsecontent = Getcontentbystream (responsestream, coded);
Responsestream.close ();
}
}

}//End head

}//End status

} catch (Exception five)
{
E.printstacktrace ();
} finally
{
if (requestmethod! = NULL)
{
Requestmethod.releaseconnection ();
}
}
return to Responsecontent;
}

/ **
* Reads information from the stream according to the specified encoding
*
* @ parameter Instream
* Parameter Code
* @ Back
* Trigger IOException
* /
Common static string Getcontentbystream (in InputStream instream, string encoding) throws a IOException exception
{
if (empty = = in-stream AD)
{
returns null;
}

StringBuilder content = new StringBuilder ();
Reads stream content in the specified encoding format
BufferedReader reader = new BufferedReader (new InputStreamReader (in-stream advertising, coding));
String message = NULL;
and (empty = (message = Reader.readline ())! )
{
Content.append (message);
Content.append ("\ r \ n");
}
Close the reader and release resources
Reader.close ();
Return (content.tostring ());
}

/ **
* Internal class, inherited from Postmethod, used to specify the postal request encoding format
* /
Postmethod of public static class Encodepostmethod extension
{
private string encoding = NULL;

Public Encodepostmethod (URL string, string encoding)
{
Super (URL);
This.encode = encoding;
}

@ Overwrite
Common String Getrequestcharset ()
{
Todo automatically generate method stubs
return (This.encode);
}

}

/ **
* Test
*
* @ parameter args
* /
public static Invalid main (string [] args)
{
System.setproperty ("Http.proxyhost", "165.228.128.10");
System.setproperty ("Http.proxyport", "3128");
System.setproperty ("Http.proxyset", "true");


String requrl = "http://news.39.net/jbyw/index.html";
Requrl = "http://news.39.net/a/2010722/1404231.html";
Map <string, string > title = new HashMap <string, String > ();
Headers.put ("Accept Code", "gzip, put gas");

HttpClient of httpclient = Gethttpclient ();
String htmlcontent = DoRequest (httpclient, Requrl, head, empty, "GBK");
System.out.println (htmlcontent);

}
}

  • Related Article

    Contact Us

    The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.