#!/usr/bin/env python
# -*- coding: GBK -*-
"""Chinese character processing tool.

The following tool determines whether a unicode character is a Chinese
character, a number, an English letter, or another character; converts
between full-width and half-width forms; and normalizes unicode strings.

All functions expect unicode text (``unicode`` on Python 2, ``str`` on
Python 3).
"""

# Python 2 names the code-point-to-character builtin ``unichr``; Python 3
# only has ``chr``.  Bind whichever exists so the module runs on both.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# Half-width characters handled here are the printable ASCII range.
_HALF_SPACE = 0x0020
_HALF_LAST = 0x007E
# The full-width space is a special case; every other full-width form sits
# at a fixed offset above its half-width counterpart.
_FULL_SPACE = 0x3000
_FULL_OFFSET = 0xFEE0


def is_chinese(uchar):
    """Return True if ``uchar`` lies in the range U+4E00..U+9FA5."""
    return u'\u4e00' <= uchar <= u'\u9fa5'


def is_number(uchar):
    """Return True if ``uchar`` is an ASCII digit (U+0030..U+0039)."""
    return u'\u0030' <= uchar <= u'\u0039'


def is_alphabet(uchar):
    """Return True if ``uchar`` is an ASCII letter (A-Z or a-z)."""
    return (u'\u0041' <= uchar <= u'\u005a'
            or u'\u0061' <= uchar <= u'\u007a')


def is_other(uchar):
    """Return True if ``uchar`` is not a Chinese character, digit or letter."""
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))


def B2Q(uchar):
    """Convert a half-width character to its full-width form.

    Characters outside 0x20..0x7E are not half-width and are returned
    unchanged.

    Raises:
        TypeError: if ``uchar`` is not a single character (from ``ord``).
    """
    inside_code = ord(uchar)
    if inside_code < _HALF_SPACE or inside_code > _HALF_LAST:
        # Not a half-width character: return the original.
        return uchar
    if inside_code == _HALF_SPACE:
        # Except for the space, full-width = half-width + 0xfee0.
        inside_code = _FULL_SPACE
    else:
        inside_code += _FULL_OFFSET
    return _unichr(inside_code)


def Q2B(uchar):
    """Convert a full-width character to its half-width form.

    If the converted code point does not land in 0x20..0x7E the input was
    not a full-width form, and the original character is returned.

    Raises:
        TypeError: if ``uchar`` is not a single character (from ``ord``).
    """
    inside_code = ord(uchar)
    if inside_code == _FULL_SPACE:
        inside_code = _HALF_SPACE
    else:
        inside_code -= _FULL_OFFSET
    if inside_code < _HALF_SPACE or inside_code > _HALF_LAST:
        # The result is not a half-width character: return the original.
        return uchar
    return _unichr(inside_code)


def stringQ2B(ustring):
    """Convert every full-width character in ``ustring`` to half-width."""
    return "".join([Q2B(uchar) for uchar in ustring])


def uniform(ustring):
    """Normalize a string: full-width to half-width, then lowercase."""
    return stringQ2B(ustring).lower()


def string2List(ustring):
    """Split ``ustring`` into runs of Chinese characters, letters and digits.

    Every character for which ``is_other`` is true acts as a separator and
    is dropped; consecutive separators do not produce empty entries.

    Returns:
        A list of the non-empty runs, in order of appearance.
    """
    retList = []
    utmp = []
    for uchar in ustring:
        if not is_other(uchar):
            utmp.append(uchar)
        elif utmp:
            # A separator closes the run collected so far.
            retList.append("".join(utmp))
            utmp = []
    if utmp:
        # Flush the trailing run when the string does not end on a separator.
        retList.append("".join(utmp))
    return retList


def _demo():
    """Print a round-trip check of Q2B/B2Q and a ``uniform`` example."""
    # Test Q2B and B2Q.
    for i in range(0x0020, 0x007F):
        full = B2Q(_unichr(i))
        print(u"%s %s" % (Q2B(full), full))

    # Test uniform.
    ustring = u"high frequency A of Chinese name"
    ustring = uniform(ustring)
    ret = string2List(ustring)
    print(ret)


if __name__ == "__main__":
    _demo()