The following tool determines whether a Unicode character is a Chinese character, a digit, an English letter, or some other character. It also converts between full-width and half-width characters and normalizes Unicode strings. A companion program that converts Chinese characters into pinyin is still being prepared.
#!/usr/bin/env python
# -*- coding: GBK -*-
"""Chinese character processing tool.

Determines whether a unicode character is a Chinese character, a digit,
an English letter, or some other character.
Converts full-width characters to half-width characters.
"""
def is_chinese(uchar):
    """Return True if *uchar* is a Chinese character.

    The check is a plain range comparison against U+4E00..U+9FA5
    (inclusive); anything outside that range is reported as not Chinese.

    Args:
        uchar: A single unicode character.

    Returns:
        bool: True when uchar lies in U+4E00..U+9FA5, False otherwise.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
def is_number(uchar):
    """Return True if *uchar* is an ASCII digit.

    Only the half-width digits U+0030..U+0039 ('0'-'9') are accepted;
    full-width digits are not.

    Args:
        uchar: A single unicode character.

    Returns:
        bool: True when uchar lies in U+0030..U+0039, False otherwise.
    """
    return u'\u0030' <= uchar <= u'\u0039'
def is_alphabet(uchar):
    """Return True if *uchar* is an English (ASCII) letter.

    Accepts the upper-case range U+0041..U+005A ('A'-'Z') and the
    lower-case range U+0061..U+007A ('a'-'z').

    Args:
        uchar: A single unicode character.

    Returns:
        bool: True when uchar is in either letter range, False otherwise.
    """
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower
def is_other(uchar):
    """Return True if *uchar* is not a Chinese character, digit or letter.

    This is the complement of is_chinese(), is_number() and is_alphabet()
    taken together.

    Args:
        uchar: A single unicode character.

    Returns:
        bool: True when none of the three classifiers accepts uchar.
    """
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))
def B2Q(uchar):
    """Convert a half-width character to its full-width form.

    Characters whose code point is outside 0x20..0x7E are not half-width
    characters and are returned unchanged.

    Args:
        uchar: A single unicode character.

    Returns:
        The full-width counterpart of uchar, or uchar itself when it is
        not in the half-width range.

    Raises:
        TypeError: If uchar is not a single character (raised by ord()).
    """
    # unichr() only exists on Python 2, where chr() is limited to 0-255;
    # on Python 3 chr() already covers all of Unicode.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    inside_code = ord(uchar)
    # Not a half-width character: return the original character.
    if inside_code < 0x0020 or inside_code > 0x7e:
        return uchar
    # Except for the space, full-width = half-width + 0xfee0.
    if inside_code == 0x0020:
        inside_code = 0x3000
    else:
        inside_code += 0xfee0
    return to_char(inside_code)
def Q2B(uchar):
    """Convert a full-width character to its half-width form.

    The full-width space (0x3000) maps to the ordinary space; every other
    character is shifted down by 0xfee0. If the shifted code point does
    not land in the half-width range 0x20..0x7E, the input was not a
    full-width character and is returned unchanged.

    Args:
        uchar: A single unicode character.

    Returns:
        The half-width counterpart of uchar, or uchar itself when the
        conversion does not yield a half-width character.

    Raises:
        TypeError: If uchar is not a single character (raised by ord()).
    """
    # unichr() only exists on Python 2, where chr() is limited to 0-255;
    # on Python 3 chr() already covers all of Unicode.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    inside_code = ord(uchar)
    if inside_code == 0x3000:
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    # The result is not a half-width character: return the original.
    if inside_code < 0x0020 or inside_code > 0x7e:
        return uchar
    return to_char(inside_code)
def stringQ2B(ustring):
    """Convert every full-width character in *ustring* to half-width.

    Each character is passed through Q2B() and the results are joined
    back into one string; characters Q2B() leaves alone stay as they are.

    Args:
        ustring: A unicode string (any iterable of single characters).

    Returns:
        The converted string.
    """
    return "".join([Q2B(uchar) for uchar in ustring])
def uniform(ustring):
    """Normalize *ustring*: full-width to half-width, then lower-case.

    Args:
        ustring: A unicode string.

    Returns:
        The result of stringQ2B(ustring) converted to lower case.
    """
    return stringQ2B(ustring).lower()
def string2List(ustring):
    """Split *ustring* into runs of Chinese characters, letters and digits.

    Every character for which is_other() is true acts as a separator and
    is dropped. Consecutive separators do not produce empty strings.

    Args:
        ustring: A unicode string.

    Returns:
        list: The maximal runs of non-"other" characters, in order.
        Empty when ustring contains no such characters.
    """
    ret_list = []
    run = []  # characters of the run currently being collected
    for uchar in ustring:
        if not is_other(uchar):
            run.append(uchar)
        elif run:
            # A separator ends the current run; flush it and start over.
            ret_list.append("".join(run))
            run = []
    # Flush the trailing run when the string does not end on a separator.
    if run:
        ret_list.append("".join(run))
    return ret_list
if __name__ == "__main__":
    # unichr() only exists on Python 2, where chr() is limited to 0-255;
    # on Python 3 chr() already covers all of Unicode.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # Test Q2B and B2Q: round-trip every half-width character and print
    # the round-tripped character next to its full-width form.
    for i in range(0x0020, 0x007F):
        full = B2Q(to_char(i))
        # One pre-formatted string prints identically on Python 2 and 3.
        print(u"%s %s" % (Q2B(full), full))

    # Test uniform
    # NOTE(review): this sample text looks like a translated placeholder;
    # the original presumably mixed Chinese and full-width characters.
    # TODO confirm and restore the intended sample.
    ustring = u"high frequency A' of Chinese name"
    ustring = uniform(ustring)
    ret = string2List(ustring)