# -*- coding: utf-8 -*-
"""
Created on Wed June 28 18:42:33 2017
@author: LMT
"""
import re

import numpy as np
"""
This program extracts word-alignment relationships from a bilingual corpus that has been aligned with GIZA++.
It builds a mapping matrix between the source language and the target language. Numbering starts at 0, and the NULL token in the file is treated as the first word of the source line.
If two words are aligned, the matrix entry at [target position][source position] is set to 1; every other entry is 0.
"""
def alig_pairs(filepath):
    """Build one 0/1 word-alignment matrix per sentence pair of a GIZA++ file.

    The file is consumed in records of three lines: a header line (ignored),
    the whitespace-separated target sentence, and the source sentence in the
    form ``NULL ({ 3 }) word ({ 1 2 }) ...`` where each ``({ ... })`` group
    lists the 1-based target positions aligned to the preceding source token.

    Args:
        filepath: Path of the alignment file to read.

    Returns:
        A list with one ``numpy`` array per record. Each array has shape
        ``(target_length, source_length)``; the leading NULL token is not
        counted as a source word. Entry ``[t][s]`` is 1 when target word
        ``t`` is aligned to source word ``s`` and 0 otherwise.

    Raises:
        ValueError: If a record's source line contains no ``({ ... })``
            group at all (for example a truncated file).
    """
    # The capturing group makes re.split() keep the index list of every
    # "({ ... })" group, so the split result alternates token, indices, ...
    group_pattern = re.compile(r'\(\{([0-9 ]*)\}\)')
    matrices = []
    with open(filepath, 'r') as handle:
        while True:
            header = handle.readline()
            if not header:
                break
            if not header.strip():
                # Stray blank lines (e.g. at the end of the file) are not
                # record headers; skipping them avoids reading a bogus record.
                continue
            target = handle.readline().strip().split()
            source = handle.readline().strip()
            # Odd positions of the split hold the index lists, NULL's first.
            groups = group_pattern.split(source)[1::2]
            if not groups:
                raise ValueError(
                    'no alignment groups found in source line: %r' % source
                )
            matrix = np.zeros((len(target), len(groups) - 1))
            # groups[0] belongs to NULL and is deliberately left out, so
            # column 0 corresponds to the first real source word.
            for column, group in enumerate(groups[1:]):
                for position in group.split():
                    # Positions in the file are 1-based.
                    matrix[int(position) - 1][column] = 1
            # Matrices differ in size per sentence, so they are collected in
            # a list rather than stacked into one array.
            matrices.append(matrix)
    return matrices
#alig_pairs (' Test.txt ')
#alig_pairs (' 117-06-28.183340.LMT. A3.final ')
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import codecs
def _read_sentences(path):
    """Read a blank-line-separated word file into a list of sentences.

    Lines with exactly four whitespace-separated fields become one entry:
    fields 2 and 3 are swapped and the four fields are re-joined with single
    spaces. An empty line closes the current sentence. Lines with any other
    field count are ignored.
    """
    sentences = []
    words = []
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            fields = line.strip().split()
            if len(fields) == 4:
                fields[1], fields[2] = fields[2], fields[1]
                words.append(" ".join(fields))
            elif not fields:
                sentences.append(words)
                words = []
    if words:
        # Keep a final sentence that is not followed by a blank line instead
        # of silently dropping it.
        sentences.append(words)
    return sentences


def get_matrix(chinese_path="result/result_cn", english_path="result/result_en"):
    """Build a labelled, "0"-filled grid for every Chinese/English sentence pair.

    Args:
        chinese_path: UTF-8 file with the Chinese sentences, one 4-field
            entry per line and a blank line after each sentence.
        english_path: UTF-8 file with the English sentences, same layout.

    Returns:
        A tuple ``(matrix, chinese_sentence)``. ``matrix`` holds one grid
        (list of lists of strings) per sentence pair: row 0 carries the
        English entries from column 1 on, column 0 carries the Chinese
        entries from row 1 on, and every other cell is the string ``"0"``.
        ``chinese_sentence`` is the list of parsed Chinese sentences. When
        the two files contain a different number of sentences, ``ERROR`` is
        printed and ``matrix`` is empty.
    """
    chinese_sentence = _read_sentences(chinese_path)
    english_sentence = _read_sentences(english_path)

    matrix = []
    if len(english_sentence) != len(chinese_sentence):
        print('ERROR')
        return matrix, chinese_sentence

    for chinese_words, english_words in zip(chinese_sentence, english_sentence):
        # One extra row and column hold the labels; grids differ in size from
        # sentence to sentence.
        grid = [
            ["0"] * (len(english_words) + 1)
            for _ in range(len(chinese_words) + 1)
        ]
        grid[0][1:] = english_words
        for row, entry in enumerate(chinese_words, start=1):
            grid[row][0] = entry
        matrix.append(grid)
    return matrix, chinese_sentence
#matrix, _ = Get_matrix ()
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import string
def count(filepath):
    """Print line statistics for a text file.

    Prints the path, then the number of blank (whitespace-only) lines, the
    number of lines whose first non-blank character is ``#``, and the total
    number of lines.

    Args:
        filepath: Path of the file to inspect.
    """
    total = 0        # all lines
    countpound = 0   # comment lines
    countblank = 0   # blank lines
    with open(filepath, 'r') as handle:
        for raw in handle:
            total += 1
            # Strip first so that indented "#" lines are recognised too.
            stripped = raw.strip()
            if not stripped:
                countblank += 1
            elif stripped.startswith('#'):
                countpound += 1
    print(filepath)
    print("countblank:%d" % countblank)
    print("countpound:%d" % countpound)
    print("total:%d" % total)
# Report the line statistics of the corpus file.
count('RESULT_CN')
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
def bijiao(path_a='Lmt.txt', path_b='Lh.txt'):
    """Compare two text files line by line.

    Args:
        path_a: Path of the first file.
        path_b: Path of the second file.

    Returns:
        The 1-based numbers of all lines that differ between the two files,
        in ascending order. Lines that exist in only one of the files count
        as different. The list is empty when the files are identical.
    """
    from itertools import zip_longest

    with open(path_a, 'r') as first, open(path_b, 'r') as second:
        # zip_longest pads the shorter file with None, so surplus lines in
        # either file are reported instead of being ignored.
        return [
            number
            for number, (line_a, line_b) in enumerate(
                zip_longest(first, second), start=1
            )
            if line_a != line_b
        ]
# Compare the two default files and report every differing line.
c = bijiao()
if not c:
    # bijiao() returns a list, so "no differences" is an empty list, not 0.
    print('Two files like!')
else:
    print('%d different' % len(c))
    for each in c:
        # Each entry is a line number, not a count.
        print('line %d is different' % each)
# Common python files