In the previous test of UTF-8 encoding, the boost.locale library was used to convert the UTF-8 encoding of a character into its Unicode code point.
I read some of the boost.locale code today. The author, Artyom, suggested using the utf_to_utf function to get everything done. My requirement is to parse out all the Unicode code points of a string, so the name utf_to_utf does not seem to fit.
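For comparison, here is a minimal sketch of what the utf_to_utf route would look like (this is not the code used in this post, and it assumes C++11 and boost/locale/encoding_utf.hpp): converting the UTF-8 string to UTF-32 yields one element per code point.

#include <boost/locale/encoding_utf.hpp>
#include <cstdint>
#include <iostream>
#include <string>

int main() {
  std::string utf8 = "一二三";
  // Each element of the resulting UTF-32 string is one Unicode code point.
  std::u32string utf32 = boost::locale::conv::utf_to_utf<char32_t>(utf8);
  for (char32_t cp : utf32) {
    std::cout << "0x" << std::hex << static_cast<uint32_t>(cp) << std::endl;
  }
}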
So I modified some of Artyom's code. Let's first take a look at the calling code in main.cc:
# Include "test. H "# include" util/endian. H "# include" util/UTF. H "# include <iostream> using namespace STD; int main (INT argc, char ** argv) {// test (3> 2 ); char const * P = "1"; cout <printstringasbinarystring (p) <Endl; string STR = "1, 2, 3"; cout <printstringasbinarystring (STR) <Endl; string:: iterator itor = Str. begin (); vector <code_point> points; utf8tounicode (itor, str. end (), points); cout <"code point0: 0x" <STD: Hex <points [0] <"binary format: B "<printintasbinarystring (points [0]) <Endl; cout <" code point1: 0x "<STD :: hex <points [1] <"binary format: B" <printintasbinarystring (points [1]) <Endl; cout <"code point2: 0x "<STD: Hex <points [2] <" binary format: B "<printintasbinarystring (points [2]) <Endl ;}
Running it on the string "一二三" (one, two, three) prints the following:
code point0: 0x4e00 binary format: B00000000000000000100111000000000
code point1: 0x4e8c binary format: B00000000000000000100111010001100
code point2: 0x4e09 binary format: B00000000000000000100111000001001
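As a sanity check, here is the decoding of 一 (U+4E00) done by hand: its UTF-8 bytes are E4 B8 80, and stripping the 1110 length marker from the lead byte and the 10 prefix from each trail byte reassembles the code point.

#include <cassert>
#include <cstdint>

int main() {
  // UTF-8 encoding of 一 is E4 B8 80 (one lead byte + two trail bytes).
  uint8_t lead = 0xE4, trail1 = 0xB8, trail2 = 0x80;
  // Lead byte 1110 0100: keep the low 4 payload bits.
  uint32_t cp = lead & 0x0F;           // 0b0100
  cp = (cp << 6) | (trail1 & 0x3F);    // append 0b111000
  cp = (cp << 6) | (trail2 & 0x3F);    // append 0b000000
  assert(cp == 0x4E00);                // matches "code point0" above
}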
All the implementation code is in utf.h:
#ifndef UTIL_UTF_H_
#define UTIL_UTF_H_

#include "util/endian.h"
#include "util/unicode_error.h"
#include <boost/locale/utf.hpp>
#include <cstring>
#include <sstream>
#include <string>
#include <vector>

using namespace std;
using namespace boost::locale::utf;

string PrintStringAsBinaryString(char const* p) {
  stringstream stream;
  for (size_t i = 0; i < strlen(p); ++i) {
    stream << PrintIntAsBinaryString(p[i]);
    stream << " ";
  }
  return stream.str();
}

string PrintStringAsBinaryString(string const& str) {
  stringstream stream;
  for (size_t i = 0; i < str.size(); ++i) {
    stream << PrintIntAsBinaryString(str[i]);
    stream << " ";
  }
  return stream.str();
}

struct ParseResult {
  code_point point;
  size_t size;
};

// Number of trail bytes that follow this lead byte, or -1 if it is invalid.
int trail_length(char ci) {
  unsigned char c = ci;
  if (c < 128)
    return 0;
  if (BOOST_LOCALE_UNLIKELY(c < 194))
    return -1;
  if (c < 224)
    return 1;
  if (c < 240)
    return 2;
  if (BOOST_LOCALE_LIKELY(c <= 244))
    return 3;
  return -1;
}

// Number of bytes needed to encode this code point in UTF-8.
int width(code_point value) {
  if (value <= 0x7F) {
    return 1;
  } else if (value <= 0x7FF) {
    return 2;
  } else if (BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
    return 3;
  } else {
    return 4;
  }
}

bool is_trail(char ci) {
  unsigned char c = ci;
  return (c & 0xC0) == 0x80;
}

bool is_lead(char ci) {
  return !is_trail(ci);
}

// Decode one UTF-8 sequence starting at p, advancing p past it.
template<typename Iterator>
void ParseUTF8(Iterator& p, Iterator e, ParseResult& result) {
  if (BOOST_LOCALE_UNLIKELY(p == e)) {
    throw UnicodeError("ParseUTF8 failed");
  }

  unsigned char lead = *p++;

  // First byte is fully validated here
  int trail_size = trail_length(lead);
  if (BOOST_LOCALE_UNLIKELY(trail_size < 0)) {
    throw UnicodeError("ParseUTF8 failed");
  }

  //
  // Ok as only ASCII may be of size = 0
  // also optimize for ASCII text
  //
  if (trail_size == 0) {
    result.point = lead;
    result.size = 1;
    return;
  }

  code_point c = lead & ((1 << (6 - trail_size)) - 1);

  // Read the rest (cases deliberately fall through)
  unsigned char tmp;
  switch (trail_size) {
    case 3:
      if (BOOST_LOCALE_UNLIKELY(p == e)) {
        throw UnicodeError("ParseUTF8 failed");
      }
      tmp = *p++;
      c = (c << 6) | (tmp & 0x3F);
    case 2:
      if (BOOST_LOCALE_UNLIKELY(p == e)) {
        throw UnicodeError("ParseUTF8 failed");
      }
      tmp = *p++;
      c = (c << 6) | (tmp & 0x3F);
    case 1:
      if (BOOST_LOCALE_UNLIKELY(p == e)) {
        throw UnicodeError("ParseUTF8 failed");
      }
      tmp = *p++;
      c = (c << 6) | (tmp & 0x3F);
  }

  // Check code point validity: no surrogates and valid range
  if (BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c))) {
    throw UnicodeError("ParseUTF8 failed");
  }

  // make sure it is the most compact representation
  if (BOOST_LOCALE_UNLIKELY(width(c) != trail_size + 1)) {
    throw UnicodeError("ParseUTF8 failed");
  }

  result.point = c;
  result.size = trail_size + 1;
}

// Convert the UTF-8 sequence that represents one single Unicode character
// in [start, end) to a Unicode code point.
template<typename Iterator>
code_point UTF8ToUnicode(Iterator& start, Iterator end) {
  ParseResult result;
  ParseUTF8(start, end, result);
  return result.point;
}

// Convert every UTF-8 sequence in [start, end) to Unicode code points.
template<typename Iterator>
void UTF8ToUnicode(Iterator& start, Iterator end, vector<code_point>& points) {
  ParseResult result;
  Iterator begin = start;
  while (begin < end) {
    ParseUTF8(start, end, result);
    points.push_back(result.point);
    begin += result.size;
  }
}

#endif  // UTIL_UTF_H_
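Since ParseUTF8 is adapted from boost.locale, the same loop could also be written directly on top of boost::locale::utf::utf_traits, the decoder Artyom's library already ships. A rough sketch (using return-value error handling via utf::illegal / utf::incomplete instead of exceptions; the function name is my own):

#include <boost/locale/utf.hpp>
#include <vector>

namespace utf = boost::locale::utf;

// Decode every code point in [begin, end); return false on malformed input.
template <typename Iterator>
bool DecodeWithUtfTraits(Iterator begin, Iterator end,
                         std::vector<utf::code_point>& points) {
  while (begin != end) {
    // decode advances begin past the sequence it consumed.
    utf::code_point c = utf::utf_traits<char>::decode(begin, end);
    if (c == utf::illegal || c == utf::incomplete)
      return false;  // malformed or truncated UTF-8
    points.push_back(c);
  }
  return true;
}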
I have not studied wstring yet, but in general std::string is enough for me. Understanding Unicode and UTF-8 encoding has greatly improved my understanding of the underlying system.