Java string encodings and conversion between character sets
import java.io.UnsupportedEncodingException;
/**
 * Demonstrates converting strings between character encodings.
 *
 * <p>Every conversion here works the same way: the input string is encoded to a
 * byte array, and that byte array is then decoded with the target charset. The
 * bytes are re-interpreted, not transcoded, so the result only equals the input
 * when both charsets represent the characters with the same bytes.
 */
public class ChangeCharset {
    /** 7-bit ASCII, also known as ISO646-US, the Basic Latin block of the Unicode character set. */
    public static final String US_ASCII = "US-ASCII";
    /** ISO Latin Alphabet No. 1, also known as ISO-LATIN-1. */
    public static final String ISO_8859_1 = "ISO-8859-1";
    /** 8-bit UCS Transformation Format. */
    public static final String UTF_8 = "UTF-8";
    /** 16-bit UCS Transformation Format, big-endian byte order (high byte at the lowest address). */
    public static final String UTF_16BE = "UTF-16BE";
    /** 16-bit UCS Transformation Format, little-endian byte order (low byte at the lowest address). */
    public static final String UTF_16LE = "UTF-16LE";
    /** 16-bit UCS Transformation Format, byte order identified by an optional byte-order mark. */
    public static final String UTF_16 = "UTF-16";
    /** Chinese character set. */
    public static final String GBK = "GBK";

    /**
     * Re-decodes the default-charset bytes of {@code str} as US-ASCII.
     *
     * @param str string to convert; may be {@code null}
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if US-ASCII is not supported by the runtime
     */
    public String toASCII(String str) throws UnsupportedEncodingException {
        return this.changeCharset(str, US_ASCII);
    }

    /**
     * Re-decodes the default-charset bytes of {@code str} as ISO-8859-1.
     *
     * @param str string to convert; may be {@code null}
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if ISO-8859-1 is not supported by the runtime
     */
    public String toISO_8859_1(String str) throws UnsupportedEncodingException {
        return this.changeCharset(str, ISO_8859_1);
    }

    /**
     * Re-decodes the default-charset bytes of {@code str} as UTF-8.
     *
     * @param str string to convert; may be {@code null}
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if UTF-8 is not supported by the runtime
     */
    public String toUTF_8(String str) throws UnsupportedEncodingException {
        return this.changeCharset(str, UTF_8);
    }

    /**
     * Re-decodes the default-charset bytes of {@code str} as UTF-16BE.
     *
     * @param str string to convert; may be {@code null}
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if UTF-16BE is not supported by the runtime
     */
    public String toUTF_16BE(String str) throws UnsupportedEncodingException {
        return this.changeCharset(str, UTF_16BE);
    }

    /**
     * Re-decodes the default-charset bytes of {@code str} as UTF-16LE.
     *
     * @param str string to convert; may be {@code null}
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if UTF-16LE is not supported by the runtime
     */
    public String toUTF_16LE(String str) throws UnsupportedEncodingException {
        return this.changeCharset(str, UTF_16LE);
    }

    /**
     * Re-decodes the default-charset bytes of {@code str} as UTF-16.
     *
     * @param str string to convert; may be {@code null}
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if UTF-16 is not supported by the runtime
     */
    public String toUTF_16(String str) throws UnsupportedEncodingException {
        return this.changeCharset(str, UTF_16);
    }

    /**
     * Re-decodes the default-charset bytes of {@code str} as GBK.
     *
     * @param str string to convert; may be {@code null}
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if GBK is not supported by the runtime
     */
    public String toGBK(String str) throws UnsupportedEncodingException {
        return this.changeCharset(str, GBK);
    }

    /**
     * Encodes {@code str} with the platform default charset and decodes the
     * resulting bytes with {@code newCharset}.
     *
     * @param str string to convert; may be {@code null}
     * @param newCharset name of the charset used to decode the bytes
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if {@code newCharset} is not supported
     */
    public String changeCharset(String str, String newCharset)
            throws UnsupportedEncodingException {
        if (str != null) {
            // Encode the string to bytes using the platform default charset.
            byte[] bs = str.getBytes();
            // Build a new string by decoding those bytes with the target charset.
            return new String(bs, newCharset);
        }
        return null;
    }

    /**
     * Encodes {@code str} with {@code oldCharset} and decodes the resulting
     * bytes with {@code newCharset}.
     *
     * @param str string to convert; may be {@code null}
     * @param oldCharset name of the charset used to encode {@code str} to bytes
     * @param newCharset name of the charset used to decode the bytes
     * @return the re-decoded string, or {@code null} if {@code str} is {@code null}
     * @throws UnsupportedEncodingException if either charset name is not supported
     */
    public String changeCharset(String str, String oldCharset, String newCharset)
            throws UnsupportedEncodingException {
        if (str != null) {
            // Encode with the old charset; characters it cannot represent are replaced.
            byte[] bs = str.getBytes(oldCharset);
            // Build a new string by decoding those bytes with the target charset.
            return new String(bs, newCharset);
        }
        return null;
    }

    /**
     * Runs each conversion on a sample string and prints the results.
     *
     * @param args unused
     * @throws UnsupportedEncodingException if a charset used by the demo is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        ChangeCharset test = new ChangeCharset();
        String str = "this is a Chinese string! ";
        System.out.println("str:" + str);
        // Local is named gbk so that it does not shadow the GBK constant.
        String gbk = test.toGBK(str);
        System.out.println("converted to GBK Code:" + gbk);
        System.out.println();
        String ascii = test.toASCII(str);
        System.out.println("converted to US-ASCII:" + ascii);
        gbk = test.changeCharset(ascii, ChangeCharset.US_ASCII, ChangeCharset.GBK);
        System.out.println("convert the ASCII code string to GBK:" + gbk);
        System.out.println();
        String iso88591 = test.toISO_8859_1(str);
        System.out.println("converted to ISO-8859-1 Code:" + iso88591);
        gbk = test.changeCharset(iso88591, ChangeCharset.ISO_8859_1, ChangeCharset.GBK);
        System.out.println("then convert the ISO-8859-1 code string into GBK Code:" + gbk);
        System.out.println();
        String utf8 = test.toUTF_8(str);
        System.out.println("convert to UTF-8 Code:" + utf8);
        gbk = test.changeCharset(utf8, ChangeCharset.UTF_8, ChangeCharset.GBK);
        System.out.println("then convert the UTF-8 code string into GBK Code:" + gbk);
        System.out.println();
        String utf16be = test.toUTF_16BE(str);
        System.out.println("convert to UTF-16BE Code:" + utf16be);
        gbk = test.changeCharset(utf16be, ChangeCharset.UTF_16BE, ChangeCharset.GBK);
        System.out.println("then convert the UTF-16BE code string into GBK Code:" + gbk);
        System.out.println();
        String utf16le = test.toUTF_16LE(str);
        System.out.println("convert to UTF-16LE Code:" + utf16le);
        gbk = test.changeCharset(utf16le, ChangeCharset.UTF_16LE, ChangeCharset.GBK);
        System.out.println("then convert the UTF-16LE code string into GBK Code:" + gbk);
        System.out.println();
        String utf16 = test.toUTF_16(str);
        System.out.println("convert to UTF-16 Code:" + utf16);
        // NOTE(review): the string was decoded as UTF-16 above but is re-encoded as
        // UTF-16LE here; this looks like a mismatch - confirm the intended charset.
        gbk = test.changeCharset(utf16, ChangeCharset.UTF_16LE, ChangeCharset.GBK);
        System.out.println("then convert the UTF-16 code string into GBK Code:" + gbk);
        String s = new String("Chinese".getBytes("UTF-8"), "UTF-8");
        System.out.println(s);
    }
}
Bytes ------------------------------------------------------------------------------------------------------------------
Java's String class stores text as Unicode. When a string is constructed with String(byte[] bytes, String encoding), the encoding argument states how the data in bytes is encoded; it is not the encoding of the resulting string. In other words, it tells the system to convert the data in bytes from that encoding to Unicode. If the argument is omitted, the JDK decodes the bytes using the default encoding, which it determines from the operating system.
When reading data from a file, it is best to read the raw bytes through an InputStream and then use String(byte[] bytes, String encoding) to specify the file's encoding. Do not use a Reader that has no explicit encoding, because such a Reader automatically converts the file content to Unicode using the JDK's default encoding.
When reading text data from a database, use the ResultSet.getBytes() method to obtain the byte array, and then use the String constructor that takes an encoding in the same way:
// Read the raw bytes from the result set and decode them as gb2312.
ResultSet rs;
byte[] bytes = rs.getBytes();
String str = new String(bytes, "gb2312");
Do not do it the following way:
// Discouraged: the string is first decoded by getString(), then encoded back
// to bytes as iso8859-1 and decoded a second time as gb2312.
ResultSet rs;
String str = rs.getString();
str = new String(str.getBytes("iso8859-1"), "gb2312");
This way of converting the encoding is inefficient. It is needed because ResultSet.getString() assumes by default that the data in the database is encoded as iso8859-1 and converts it to Unicode accordingly. str.getBytes("iso8859-1") then restores the original bytes, and new String(bytes, "gb2312") finally converts them from gb2312 to Unicode, so there are several unnecessary intermediate steps.
When reading parameters from an HTTP request, you can call request.setCharacterEncoding() to set the encoding so that the parameters are read correctly.