Crawling web page source code with HttpClient

Source: Internet
Author: User

package util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.params.HttpMethodParams;

/**
* @ Author Liuwei
* Date: Dec 18,200 9
*
* Todo
* Httpclient auxiliary class
*/
Public class httpclienthelper
{

/**
* Httpclient connection timeout and data read timeout (unit: milliseconds)
*/
Public static final int httpclient_connection_timeout = 30000;
Public static final int httpclient_so_timeout = 120000;
Public static final int httpmethod_so_timeout = 5000;

// Enable connectionmanager to disable connections when managing httpclientconnection
Private Static Boolean alwaysclose = false;
Private Static string defaultencode = "UTF-8 ";

Private Static final dateformat date_format = new simpledateformat ("yyyy-mm-dd hh: mm: SS ");

/**
* Obtain the httpclient connection and set relevant parameters.
*
* @ Return
*/
Public static httpclient gethttpclient ()
{
Httpclient client = new httpclient (New simplehttpconnectionmanager (alwaysclose ));
Httpconnectionmanagerparams managerparams = client. gethttpconnectionmanager (). getparams ();
// Set the connection timeout (unit: milliseconds)
Managerparams. setconnectiontimeout (httpclient_connection_timeout );
// Set the read data timeout (unit: milliseconds)
Managerparams. setsotimeout (httpclient_so_timeout );
Return client;
}

/**
* Obtain the httpclient connection and set relevant parameters.
*
* @ Param logonsite
* @ Param logonport
* @ Param Protocol
* @ Return
*/
Public static httpclient gethttpclient (final string logonsite, final int logonport, final string Protocol)
{
Httpclient client = new httpclient (New simplehttpconnectionmanager (alwaysclose ));
Client. gethostconfiguration (). sethost (logonsite, logonport, Protocol );
Httpconnectionmanagerparams managerparams = client. gethttpconnectionmanager (). getparams ();
// Set the connection timeout (unit: milliseconds)
Managerparams. setconnectiontimeout (httpclient_connection_timeout );
// Set the read data timeout (unit: milliseconds)
Managerparams. setsotimeout (httpclient_so_timeout );
Return client;
}

Private Static list {
List Boolean required deuseragent = false;
If (null! = Header & false = header. isempty ())
{
Set <entry <string, string> entryset = header. entryset ();
For (Entry <string, string> entry: entryset)
{
If (false = includeuseragent
& "User-Agent". Equals (entry. getkey ()))
{
Includeuseragent = true;
}
Headers. Add (new header (entry. getkey (), entry. getvalue ()));
}
}

If (false = includeuseragent)
{
Headers. Add (new header (
"User-Agent ",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; gtb5;. Net CLR 1.1.4322;. Net CLR 2.0.50727; Alexa toolbar; Maxthon 2.0 )"));
}
Return headers;
}

Private Static namevaluepair [] getpairs (Map <string, string> postdata)
{
If (null = postdata | postdata. isempty ())
{
Return NULL;
}

Set <entry <string, string> entryset = postdata. entryset ();
Int datalength = entryset. Size ();
Namevaluepair [] pairs = new namevaluepair [datalength];
Int I = 0;
For (Entry <string, string> entry: entryset)
{
Pairs [I ++] = new namevaluepair (entry. getkey (), entry. getvalue ());
}
Return pairs;
}

/**
* Request webpage content information
*
* @ Param httpclient
* @ Param requrl
* @ Param Header
* @ Param postdata
* @ Param encode
* @ Return
*/
Public static string dorequest (httpclient, string requrl,
Map <string, string> header, Map <string, string> postdata, string encode)
{
String htmlcontent = NULL;
If (null = httpclient)
{
Return htmlcontent;
}

// Request Encoding settings
Encode = (null = encode? Defaultencode: encode );

// Header request information
List

System. Out. println ("[" + date_format.format (new date () + "] -- dorequest --" + requrl );

// POST method
If (null! = Postdata)
{
Postmethod = new encodepostmethod (requrl, encode );
For (header tempheader: headers)
{
Postmethod. setRequestHeader (tempheader );
}

// Post parameter settings
Namevaluepair [] Params = getpairs (postdata );
If (null! = Params)
{
Postmethod. setrequestbody (Params );
}

// Extract webpage content
Htmlcontent = executemethod (httpclient, postmethod, encode, getwebsite (requrl ));
}
Else
{
Getmethod = new getmethod (requrl );
For (header tempheader: headers)
{
Getmethod. setRequestHeader (tempheader );
}

// Extract webpage content
Htmlcontent = executemethod (httpclient, getmethod, encode, null );
}
Return htmlcontent;
}

Private Static string getwebsite (string requrl)
{
String website = NULL;
If (null = requrl | requrl. isempty ())
{
Return website;
}

String prefix = "http ://";
If (requrl. startswith (prefix ))
{
Int Index = requrl. substring (prefix. Length (). indexof ("/") + prefix. Length ();
Website = requrl. substring (0, index );
}
Return website;
}

/**
* Use httpmethod to obtain webpage content
*
* @ Param httpclient
* @ Param requestmethod
* @ Param encode
* @ Param website
* @ Return
*/
Private Static string executemethod (httpclient, httpmethod requestmethod, string encode, string website)
{
String responsecontent = NULL;
If (null = httpclient)
{
Return responsecontent;
}

// Determine whether to request encrypted data
Boolean dataencrypt = false;
Header acceptencoding = requestmethod. getrequestheader ("Accept-encoding ");
If (null! = Acceptencoding
& Acceptencoding. getvalue (). Contains ("gzip "))
{
Dataencrypt = true;
}

Inputstream responsestream = NULL;
Try
{
Int status = httpclient.exe cutemethod (requestmethod );
If (httpstatus. SC _ OK = Status)
{
Responsestream = requestmethod. getresponsebodyasstream ();
Responsecontent = getcontentbystream (dataencrypt? New gzipinputstream (responsestream): responsestream, encode );
Responsestream. Close ();
}
// When the return code is 301,302,303,307, it indicates that the page has been redirected, And the location URL is requested again, which is important for some login authorization to retrieve cookies.
Else if (httpstatus. SC _moved_permanently = Status
| Httpstatus. SC _moved_temporarily = Status
| Httpstatus. SC _see_other = Status
| Httpstatus. SC _temporary_redirect = Status)
{
// Read the new URL
Header header = requestmethod. getResponseHeader ("location ");
If (header! = NULL)
{
String redirecturl = header. getvalue ();
If (null! = Redirecturl
& False = redirecturl. isempty ())
{
Responsecontent = NULL;
If (null = redirecturl | redirecturl. isempty ())
{
Redirecturl = "/";
}

If (false = redirecturl. startswith ("http ://")
& Null! = Website)
{
If (website. startswith ("/"))
{
Redirecturl = website + redirecturl;
}
Else
{
Redirecturl = website + "/" + redirecturl;
}
}

Getmethod redirect = new getmethod (redirecturl );
Header Referer = requestmethod. getrequestheader ("Referer ");
If (null! = Referer)
{
Redirect. addrequestheader (Referer );
}
Header cookie = requestmethod. getrequestheader ("cookie ");
If (null! = Cookie)
{
Redirect. addrequestheader (cookie );
}
Status = httpclient.exe cutemethod (redirect );
If (httpstatus. SC _ OK = Status)
{
Responsestream = redirect. getresponsebodyasstream ();
Responsecontent = getcontentbystream (responsestream, encode );
Responsestream. Close ();
}
}

} // End-headers

} // End-status

} Catch (exception E)
{
E. printstacktrace ();
} Finally
{
If (requestmethod! = NULL)
{
Requestmethod. releaseconnection ();
}
}
Return responsecontent;
}

/**
* Read information from the stream according to the specified Encoding
*
* @ Param instream
* @ Param encode
* @ Return
* @ Throws ioexception
*/
Public static string getcontentbystream (inputstream instream, string encode) throws ioexception
{
If (null = instream)
{
Return NULL;
}

Stringbuilder content = new stringbuilder ();
// Read the stream content in the specified encoding format
Bufferedreader reader = new bufferedreader (New inputstreamreader (instream, encode ));
String message = NULL;
While (null! = (Message = reader. Readline ()))
{
Content. append (Message );
Content. append ("\ r \ n ");
}
// Close the reader and release the resource
Reader. Close ();
Return (content. tostring ());
}

/**
* Internal class, inherited from postmethod, used to specify the POST Request Encoding format
*/
Public static class encodepostmethod extends postmethod
{
Private string encode = NULL;

Public encodepostmethod (string URL, string encode)
{
Super (URL );
This. encode = encode;
}

@ Override
Public String getrequestcharset ()
{
// Todo auto-generated method stub
Return (this. encode );
}

}

/**
* Test
*
* @ Param ARGs
*/
Public static void main (string [] ARGs)
{
// System. setproperty ("HTTP. proxyhost", "165.228.128.10 ");
// System. setproperty ("HTTP. proxyport", "3128 ");
// System. setproperty ("HTTP. proxyset", "true ");

String requrl = "http://news.39.net/jbyw/index.html ";
Requrl = "http://news.39.net/a/2010722/1404231.html ";
Map <string, string> headers = new hashmap <string, string> ();
Headers. Put ("Accept-encoding", "gzip, deflate ");

Httpclient = gethttpclient ();
String htmlcontent = dorequest (httpclient, requrl, headers, null, "GBK ");
System. Out. println (htmlcontent );

}
}

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.