I have started a blog on Blog Park to share some of my own work. I hope I can keep it up.
The first article is about a small thing: the Unicode escaping of strings. When processing data from the network, it is sometimes necessary to Unicode-escape Chinese characters, that is, to turn them into sequences such as \u4e2d\u6587, to facilitate network transmission. To make encoding and decoding convenient, I wrote two functions for this.
First comes the hexadecimal character check, which I put into a separate class so that it can be reused. The function is very simple: it determines whether a character is within the range [0-9a-fA-F]. There is one small trick here: the comparisons are arranged in the order of a binary search tree, which makes the check slightly faster.
using System.Text;

namespace Cyjb
{
    /// <summary>
    /// Provides extension methods for the <see cref="System.Char"/> class.
    /// </summary>
    public static class CharExt
    {
        #region IsHex

        /// <summary>
        /// Indicates whether the specified Unicode character is a hexadecimal digit,
        /// i.e. one of <c>0-9</c>, <c>a-f</c> or <c>A-F</c>.
        /// </summary>
        /// <param name="ch">The Unicode character to test.</param>
        /// <returns><c>true</c> if <paramref name="ch"/> is a hexadecimal digit;
        /// otherwise, <c>false</c>.</returns>
        public static bool IsHex(this char ch)
        {
            // The comparisons are ordered like a binary search over the ASCII table:
            // first cut off everything above 'f', then split at 'A'.
            if (ch <= 'f')
            {
                if (ch >= 'A')
                {
                    // Within ['A', 'f'] only 'A'..'F' and 'a'..'f' are hexadecimal digits.
                    return ch <= 'F' || ch >= 'a';
                }
                else
                {
                    // Below 'A' only the decimal digits qualify.
                    return ch >= '0' && ch <= '9';
                }
            }
            return false;
        }

        /// <summary>
        /// Indicates whether the character at the specified position of the specified
        /// string is a hexadecimal digit.
        /// </summary>
        /// <param name="str">A string.</param>
        /// <param name="index">The position of the character to test in
        /// <paramref name="str"/>.</param>
        /// <returns><c>true</c> if the character at position <paramref name="index"/> in
        /// <paramref name="str"/> is a hexadecimal digit; otherwise, <c>false</c>.</returns>
        /// <exception cref="System.IndexOutOfRangeException"><paramref name="index"/> is
        /// greater than or equal to the length of the string, or less than zero.</exception>
        public static bool IsHex(string str, int index)
        {
            return IsHex(str[index]);
        }

        #endregion // IsHex
    }
}
Next come the Unicode encoding and decoding methods for strings.
The decoding method supports the \x, \u, and \U escapes, where \x can be followed by 1 to 4 hexadecimal characters, \u must be followed by exactly 4 hexadecimal characters, and \U must be followed by exactly 8. Because .NET only supports Unicode code points up to 0x10FFFF, the excess high-order bits are discarded when \U is used. If the preceding conditions are not met, the sequence is left unescaped and no error is raised.
Encoding leaves the displayable characters (0x20 ~ 0x7E, that is, from space to ~) unchanged; all other characters are represented by \u escapes.
using System.Globalization;
using System.Text;

namespace Cyjb
{
    /// <summary>
    /// Provides extension methods for the <see cref="System.String"/> class.
    /// </summary>
    public static class StringExt
    {
        #region Unicode operations

        /// <summary>
        /// Converts the <c>\u</c>, <c>\U</c> and <c>\x</c> escape sequences in a string
        /// to the characters they denote.
        /// </summary>
        /// <remarks>
        /// <c>\x</c> may be followed by 1 to 4 hexadecimal digits, <c>\u</c> must be
        /// followed by exactly 4 and <c>\U</c> by exactly 8. For <c>\U</c> only the low
        /// 21 bits of the value are used. A sequence that does not satisfy these rules,
        /// or whose value is above 0x10FFFF, is copied to the output unchanged.
        /// </remarks>
        /// <param name="str">The string to convert.</param>
        /// <returns>The converted string. A <c>null</c> or empty input, or an input that
        /// contains no backslash, is returned as is.</returns>
        public static string DecodeUnicode(this string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return str;
            }
            int idx = str.IndexOf('\\');
            if (idx < 0)
            {
                return str;
            }
            int len = str.Length, start = 0;
            StringBuilder builder = new StringBuilder(len);
            while (idx >= 0)
            {
                // Copy the text that precedes the current '\'.
                if (idx > start)
                {
                    builder.Append(str, start, idx - start);
                    start = idx;
                }
                // Skip the '\' itself; idx now points at the escape letter.
                idx++;
                // Fewer than two characters follow the '\': it cannot be an escape.
                if (idx + 1 >= len)
                {
                    break;
                }
                // Number of hexadecimal digits that belong to the escape (0 = not an escape).
                int hexLen = 0;
                switch (str[idx])
                {
                    case 'x':
                        // \x may be followed by 1 to 4 digits.
                        hexLen = GetHexLength(str, idx + 1, 4);
                        break;
                    case 'u':
                        // \u must be followed by exactly 4 digits.
                        if (idx + 4 < len && GetHexLength(str, idx + 1, 4) == 4)
                        {
                            hexLen = 4;
                        }
                        break;
                    case 'U':
                        // \U must be followed by exactly 8 digits.
                        if (idx + 8 < len && GetHexLength(str, idx + 1, 8) == 8)
                        {
                            hexLen = 8;
                        }
                        break;
                }
                if (hexLen > 0)
                {
                    // Eight hex digits with the top bit set parse to a negative int, so
                    // mask BEFORE comparing: only the low 21 bits are meaningful.
                    int charNum = int.Parse(str.Substring(idx + 1, hexLen),
                        NumberStyles.HexNumber, CultureInfo.InvariantCulture) & 0x1FFFFF;
                    if (charNum <= 0xFFFF)
                    {
                        // A single UTF-16 code unit (this also lets \uD83D\uDE00-style
                        // surrogate halves pass through individually).
                        builder.Append((char)charNum);
                        idx = start = idx + 1 + hexLen;
                    }
                    else if (charNum <= 0x10FFFF)
                    {
                        // A supplementary code point, emitted as a surrogate pair.
                        builder.Append(char.ConvertFromUtf32(charNum));
                        idx = start = idx + 1 + hexLen;
                    }
                    // Otherwise the value is not a Unicode code point and
                    // char.ConvertFromUtf32 would throw; leave the sequence unescaped
                    // by not advancing 'start'.
                }
                idx = str.IndexOf('\\', idx);
            }
            // Copy whatever is left after the last processed escape.
            if (start < len)
            {
                builder.Append(str, start, len - start);
            }
            return builder.ToString();
        }

        /// <summary>
        /// Returns the number of consecutive hexadecimal characters found in a string
        /// starting at the specified index.
        /// </summary>
        /// <param name="str">The string to inspect.</param>
        /// <param name="index">The index at which counting starts.</param>
        /// <param name="maxLength">The maximum number of hexadecimal characters wanted;
        /// it is reduced to the number of characters remaining in the string.</param>
        /// <returns>The actual number of hexadecimal characters found, at most
        /// <paramref name="maxLength"/>.</returns>
        internal static int GetHexLength(string str, int index, int maxLength)
        {
            if (index + maxLength > str.Length)
            {
                maxLength = str.Length - index;
            }
            for (int i = 0; i < maxLength; i++, index++)
            {
                if (!CharExt.IsHex(str, index))
                {
                    return i;
                }
            }
            return maxLength;
        }

        /// <summary>
        /// Escapes the characters of a string that are outside the range 0x20 ~ 0x7E
        /// (0x00 ~ 0x1F, and 0x7F and above) to the <c>\u</c> form, writing the
        /// hexadecimal digits in uppercase.
        /// </summary>
        /// <param name="str">The string to convert.</param>
        /// <returns>The converted string.</returns>
        public static string EncodeUnicode(this string str)
        {
            return EncodeUnicode(str, true);
        }

        /// <summary>
        /// Escapes the characters of a string that are outside the range 0x20 ~ 0x7E
        /// (0x00 ~ 0x1F, and 0x7F and above) to the <c>\u</c> form.
        /// </summary>
        /// <remarks>
        /// Each UTF-16 code unit is escaped on its own as <c>\u</c> plus four hexadecimal
        /// digits. A literal backslash lies inside the displayable range and is copied
        /// unchanged.
        /// </remarks>
        /// <param name="str">The string to convert.</param>
        /// <param name="upperCase"><c>true</c> to write the hexadecimal digits in
        /// uppercase; <c>false</c> to write them in lowercase.</param>
        /// <returns>The converted string. A <c>null</c> or empty input is returned
        /// as is.</returns>
        public static string EncodeUnicode(this string str, bool upperCase)
        {
            if (string.IsNullOrEmpty(str))
            {
                return str;
            }
            // "X4" = four uppercase hex digits, "x4" = four lowercase hex digits.
            string format = upperCase ? "X4" : "x4";
            StringBuilder builder = new StringBuilder(str.Length * 2);
            for (int i = 0; i < str.Length; i++)
            {
                char c = str[i];
                if (c >= ' ' && c <= '~')
                {
                    // Displayable character: copy as is.
                    builder.Append(c);
                }
                else
                {
                    builder.Append("\\u");
                    builder.Append(((int)c).ToString(format, CultureInfo.InvariantCulture));
                }
            }
            return builder.ToString();
        }

        #endregion
    }
}
The code can be found in the CharExt and StringExt classes at https://github.com/CYJB/Cyjb/tree/master/Cyjb.