Public static string getcontent (string URL) throws exception {httpclient Hc = new httpclient (); httpmethod Hm = new getmethod (URL); int statuscode =-1; byte [] result = NULL; statuscode = hc.exe cutemethod (HM); If (statuscode! = Httpstatus. SC _ OK) // return ""; if (Hm. getresponsebody ()! = NULL) {// obtain page data result = HM. getresponsebody (); // HM. getstatusline ()-HTTP status and request result} string charset = jsouputils. getcharset (URL); // get charsethm of the page through jsoup. releaseconnection (); string data = NULL; If (result! = NULL) Data = new string (result, charset); // character encoding setting return data ;}
[Code] Get the page's character set with jsoup
/*** Get Character Set */public static string getcharset (string siteurl) throws exception {URL url = new URL (siteurl); document DOC = jsoup. parse (URL, 6*1000); elements eles = Doc. select ("meta [http-equiv = Content-Type]"); iterator <element> itor = eles. iterator (); While (itor. hasnext () return regularutils. matchcharset (itor. next (). tostring (); Return "gb2312 ";}
[Code] Use a regular expression to extract the page's character set
/*** Get the page character */public static string matchcharset (string content) {string CHS = "gb2312"; P = pattern. Compile ("(? <= Charset =) (. + )(? = \ ")"); Matcher M = P. matcher (content); If (M. Find () return M. Group (); Return CHS ;}