In Java, InputStream and OutputStream read and write bytes, while Reader and Writer read and write characters. The following example imitates a Reader by decoding UTF-8 encoded characters directly from an InputStream:
/**
 * Minimal UTF-8 decoder that pulls bytes from an {@link java.io.InputStream} and
 * returns one decoded character (Unicode code point) per call, imitating what a
 * {@code Reader} does for UTF-8 input.
 *
 * <p>UTF-8 byte layouts handled here:
 * <pre>
 *   0xxxxxxx                              1 byte  (ASCII)
 *   110xxxxx 10xxxxxx                     2 bytes
 *   1110xxxx 10xxxxxx 10xxxxxx            3 bytes
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   4 bytes
 * </pre>
 *
 * <p>The decoder checks the lead byte and the {@code 10} prefix of every
 * continuation byte. It does not reject overlong encodings or encoded
 * surrogates.
 */
public class Utf8Reader {

    /** Keeps the 5 payload bits of a two-byte lead byte (110xxxxx). */
    private static final int TWO_BYTE_LEAD_MASK = 0x1F;

    /** Keeps the 4 payload bits of a three-byte lead byte (1110xxxx). */
    private static final int THREE_BYTE_LEAD_MASK = 0x0F;

    /** Keeps the 3 payload bits of a four-byte lead byte (11110xxx). */
    private static final int FOUR_BYTE_LEAD_MASK = 0x07;

    /** Keeps the 6 payload bits of a continuation byte (10xxxxxx). */
    private static final int CONTINUATION_MASK = 0x3F;

    /** Number of payload bits contributed by each continuation byte. */
    private static final int CONTINUATION_BITS = 6;

    private final java.io.InputStream inputStream;

    /**
     * Creates a decoder on top of the given byte stream.
     *
     * @param inputStream source of UTF-8 encoded bytes
     */
    public Utf8Reader(java.io.InputStream inputStream) {
        this.inputStream = inputStream;
    }

    /**
     * Reads and decodes the next UTF-8 encoded character.
     *
     * @return the decoded code point, or {@code -1} when the stream is exhausted
     * @throws java.io.IOException if the underlying stream fails, if the next byte
     *     is not a valid lead byte, if a continuation byte does not start with
     *     {@code 10}, or if the stream ends in the middle of a character
     */
    public int readChar() throws java.io.IOException {
        // InputStream.read() yields 0..255, or -1 at end of stream.
        int lead = inputStream.read();
        if (lead < 0) {
            return -1;
        }
        if (lead < 0x80) {
            // 0xxxxxxx: plain ASCII, the byte is the character.
            return lead;
        }
        if (lead < 0xC0) {
            // 10xxxxxx can only appear inside a multi-byte character, never first.
            throw new java.io.IOException(
                    "illegal encoding [" + lead + "] character starts with 10");
        }
        if (lead < 0xE0) {
            return readContinuationBytes(lead & TWO_BYTE_LEAD_MASK, 1);
        }
        if (lead < 0xF0) {
            return readContinuationBytes(lead & THREE_BYTE_LEAD_MASK, 2);
        }
        if (lead < 0xF8) {
            return readContinuationBytes(lead & FOUR_BYTE_LEAD_MASK, 3);
        }
        throw new java.io.IOException(
                "illegal encoding [" + lead + "] is not a valid UTF-8 lead byte");
    }

    /**
     * Reads {@code count} continuation bytes and appends their payload bits to
     * the bits already extracted from the lead byte.
     *
     * <p>Bytes are read one at a time so that a short read from the underlying
     * stream can never leave part of the character undecoded.
     *
     * @param leadBits payload bits taken from the lead byte
     * @param count number of continuation bytes that must follow
     * @return the assembled code point
     * @throws java.io.IOException on end of stream or a malformed continuation byte
     */
    private int readContinuationBytes(int leadBits, int count) throws java.io.IOException {
        int codePoint = leadBits;
        for (int i = 0; i < count; i++) {
            int next = inputStream.read();
            if (next < 0) {
                throw new java.io.IOException(
                        "unexpected end of stream inside a multi-byte character");
            }
            if ((next & 0xC0) != 0x80) {
                throw new java.io.IOException(
                        "illegal encoding [" + next + "] continuation byte must start with 10");
            }
            codePoint = (codePoint << CONTINUATION_BITS) | (next & CONTINUATION_MASK);
        }
        return codePoint;
    }
}
UTF-8 encoding