How to judge character sets using Windows notepad

Source: Internet
Author: User
/* IsTextUTF8
 *
 * UTF-8 is the encoding of Unicode based on Internet Society RFC2279
 * ( See http://www.cis.ohio-state.edu/htbin/rfc/rfc2279.html )
 *
 * Basicly:
 * 0000 0000-0000 007F - 0xxxxxxx  (ascii converts to 1 octet!)
 * 0000 0080-0000 07FF - 110xxxxx 10xxxxxx    ( 2 octet format)
 * 0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format)
 * (this keeps going for 32 bit unicode) 
 * 
 *
 * Return value:  TRUE, if the text is in UTF-8 format.
 *                FALSE, if the text is not in UTF-8 format.
 *                We will also return FALSE is it is only 7-bit ascii, so the right code page
 *                will be used.
 *
 *                Actually for 7 bit ascii, it doesn't matter which code page we use, but
 *                notepad will remember that it is utf-8 and "save" or "save as" will store
 *                the file with a UTF-8 BOM.  Not cool.
 */


INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen )
{
    INT   i;
    DWORD cOctets;  // octets to go in this UTF-8 encoded character
    UCHAR chr;
    BOOL  bAllAscii= TRUE;

    cOctets= 0;
    for( i=0; i < iLen; i++ ) {
        chr= *(lpstrInputStream+i);

        if( (chr&0x80) != 0 ) bAllAscii= FALSE;

        if( cOctets == 0 )  {
            //
            // 7 bit ascii after 7 bit ascii is just fine.  Handle start of encoding case.
            //
            if( chr >= 0x80 ) {  
               //
               // count of the leading 1 bits is the number of characters encoded
               //
               do {
                  chr <<= 1;
                  cOctets++;
               }
               while( (chr&0x80) != 0 );

               cOctets--;                        // count includes this character
               if( cOctets == 0 ) return FALSE;  // must start with 11xxxxxx
            }
        }
        else {
            // non-leading bytes must start as 10xxxxxx
            if( (chr&0xC0) != 0x80 ) {
                return FALSE;
            }
            cOctets--;                           // processed another octet in encoding
        }
    }

    //
    // End of text.  Check for consistency.
    //

    if( cOctets > 0 ) {   // anything left over at the end is an error
        return FALSE;
    }

    if( bAllAscii ) {     // Not utf-8 if all ascii.  Forces caller to use code pages for conversion
        return FALSE;
    }

    return TRUE;
}


/* IsInputTextUnicode
 * Verify if the input stream is in Unicode format.
 *
 * Return value:  TRUE, if the text is in Unicode format.
 *
 * 29 June 1998          
 */


INT IsInputTextUnicode  (LPSTR lpstrInputStream, INT iLen)
{
    INT  iResult= ~0; // turn on IS_TEXT_UNICODE_DBCS_LEADBYTE
    BOOL bUnicode;

    // We would like to check the possibility
    // of IS_TEXT_UNICODE_DBCS_LEADBYTE.
    //

    bUnicode= IsTextUnicode( lpstrInputStream, iLen, &iResult);

    if (bUnicode                                         &&
       ((iResult & IS_TEXT_UNICODE_STATISTICS)    != 0 ) &&
       ((iResult & (~IS_TEXT_UNICODE_STATISTICS)) == 0 )    )
    {
        CPINFO cpiInfo;
        CHAR* pch= (CHAR*)lpstrInputStream;
        INT  cb;

        //
        // If the result depends only upon statistics, check
        // to see if there is a possibility of DBCS.
        // Only do this check if the ansi code page is DBCS
        //

        GetCPInfo( CP_ACP, &cpiInfo);

        if( cpiInfo.MaxCharSize > 1 )
        {
            for( cb=0; cb<iLen; cb++ )
            {
                if( IsDBCSLeadByte(*pch++) )
                {
                    return FALSE;
                }
            }
        }
     }

     return bUnicode;
}
Related Article

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.