Python 尋找中文字元

來源:互聯網
上載者:User

#filename Seek.py
import unicodedata

import sys
import os
class Seek():
    """
    Find Chinese (CJK) characters in the files of a directory and write
    them, together with their line numbers, to a report file.

    Usage: run as a Python script.
    Command-line parameters:
        -d        : directory to scan, absolute or relative (default '.')
        -t        : file-name suffix to scan, e.g. .jsp (default: all files)
        -sf       : also scan sub-directories, Y or N (optional, default N)
        -r        : report file name (optional, default
                    'ChineseCharacter.txt')
        -encoding : encoding used to read the scanned files and to write
                    the report (default utf-8)
    """

    def __init__(self):
        """
        Set the default options and override them from sys.argv.

        Every recognised flag consumes the argument that follows it.
        Unknown arguments are ignored.  A flag given as the very last
        argument (without a value) is reported and ignored instead of
        raising IndexError.
        """
        self.d = '.'
        self.sf = 'N'
        self.t = 'ALL'
        self.r = 'ChineseCharacter.txt'
        self.encoding = 'utf-8'
        # Report file handle; opened by seeking().
        self.rfile = None

        valueFlags = ('-d', '-sf', '-r', '-t', '-encoding')
        # Skip argv[0] (the script name) and walk by explicit position so
        # the value of every flag is read from the slot right after it.
        args = sys.argv[1:]
        pos = 0
        while pos < len(args):
            flag = args[pos]
            if flag not in valueFlags:
                pos += 1
                continue
            if pos + 1 >= len(args):
                print('input error: missing value for %s parameter' % flag)
                break
            value = args[pos + 1]
            if flag == '-d':
                self.d = value
            elif flag == '-sf':
                choice = value.upper()
                if choice in ('Y', 'N'):
                    self.sf = choice
                else:
                    print('input error with sf parameter')
            elif flag == '-r':
                self.r = value
            elif flag == '-t':
                self.t = value
            else:
                self.encoding = value
            # Step over both the flag and its value.
            pos += 2

    def seeking(self):
        """
        Run the search: open the report file, scan self.d and close the
        report again.

        Errors are printed rather than raised.  If the report file cannot
        be opened nothing is scanned.
        """
        try:
            # output file
            self.rfile = open(self.r, 'w', encoding=self.encoding)
        except (OSError, LookupError) as error:
            # LookupError covers an unknown encoding name.
            print('seek error %s' % error)
            return
        try:
            self.__seekDir(self.d)
        except Exception as error:
            # Top-level boundary of the script: report and stop.
            print('seek error %s' % error)
        finally:
            self.__close()

    def __seekDir(self, directory):
        """
        Scan the files of `directory`; descend into sub-directories when
        self.sf equals 'Y'.
        """
        # The report must never be scanned while it is being written.
        reportPath = os.path.abspath(self.r)
        # Sorted so that the report order is reproducible.
        for name in sorted(os.listdir(directory)):
            path = os.path.join(directory, name)
            if self.__isFile(path):
                if os.path.abspath(path) == reportPath:
                    continue
                if self.t == 'ALL' or name.endswith(self.t):
                    self.__seek(path)
            elif self.__isDir(path) and self.sf == 'Y':
                self.__seekDir(path)

    def __close(self):
        """
        Close the report file if it has been opened.
        """
        rfile = getattr(self, 'rfile', None)
        if rfile is not None and not rfile.closed:
            rfile.close()

    def __isFile(self, file):
        """Return True when `file` is an existing regular file."""
        return os.path.isfile(file)

    def __isDir(self, path):
        """Return True when `path` is an existing directory."""
        return os.path.isdir(path)

    def __openFile(self, file):
        """Unused placeholder; does nothing."""
        pass

    def __closeFile(self, file):
        """Close the given file object."""
        file.close()

    def __seek(self, file):
        """
        Scan one file and append its CJK characters to the report.

        For every line that contains CJK characters the characters are
        written in order (a tab separates non-adjacent runs), followed by
        eight tabs, 'line:<number>' and a newline.  After the file a
        '------------<path>' footer is written.

        Lines starting with '//' and everything between '/*' at the start
        of a line and the next '*/' are skipped.  Comment markers are only
        recognised at column 0, or directly after a closing '*/'.

        A file that cannot be opened or decoded is reported on stdout and
        skipped, so that one bad file does not abort the whole scan.
        """
        try:
            with open(file, 'r', encoding=self.encoding) as fileObj:
                lineList = fileObj.readlines()
        except (OSError, UnicodeError) as error:
            print('seek character error : %s in %s' % (error, file))
            return

        inBlockComment = False
        try:
            for lineNo, line in enumerate(lineList, 1):
                if inBlockComment:
                    end = line.find('*/')
                    if end == -1:
                        # still inside the block comment
                        continue
                    # block comment ends here; keep what follows '*/'
                    inBlockComment = False
                    line = line[end + 2:]
                if line.startswith('//'):
                    # line comment
                    continue
                if line.startswith('/*'):
                    # Search from offset 2 so that '/*/' is not mistaken
                    # for an already closed comment.
                    end = line.find('*/', 2)
                    if end == -1:
                        inBlockComment = True
                        continue
                    # one-line block comment; keep what follows '*/'
                    line = line[end + 2:]

                lastPos = None
                for pos, ch in enumerate(line):
                    # The '' default avoids ValueError for characters
                    # without a Unicode name (e.g. '\n').
                    if not unicodedata.name(ch, '').startswith('CJK'):
                        continue
                    # A gap larger than one means a new, non-adjacent run.
                    if lastPos is not None and pos - lastPos > 1:
                        self.__writeFile('\t')
                    self.__writeFile(ch)
                    lastPos = pos
                if lastPos is not None:
                    self.__writeFile('\t' * 8)
                    self.__writeFile('line:')
                    self.__writeFile('%d' % lineNo)
                    self.__writeFile('\n')
        finally:
            self.__writeFile('\n')
            self.__writeFile('------------' + file)
            self.__writeFile('\n')

    def __writeFile(self, content):
        """Write `content` to the report file."""
        self.rfile.write(content)
       
       
if __name__ == '__main__':
    # Script entry point: read the options from the command line and
    # run the search.
    Seek().seeking()

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.