#encoding: utf-8 import sys import pickle from copy import deepcopy Is_train = False Default_prob = 0.000000000001 MIN_PR OB =-1 * Float (' inf ') Train_path = "train.in" Test_path = "test.in" Output_path = "Test.out" #统计 each number of times as each probability def train (): print "Start training ..." # The following 5 elements are the HMM model parameters V = set () # observation Set Q = set () # state Set A = {} # state transition probability matrix, P (State | state), is a two-layer dict specifically pre_state-> (state->prob) B = {} # observation probability matrix, P (observation | state), is a two-layer dict specifically state-> (Observ->pro b) PI = {} # initial state probability vector # Statistical model parameters with open (Train_path, "RB") as infile:pre_s =-1 # t-1 status F or line in Infile:segs = Line.rstrip (). Split (' \ t ') If Len (segs)! = 2: # When encountering empty rows pre _s = 1 Else:o = segs[0] # t moment of observation o s = segs[1] # t moment State S # stats State S to the number of observations o b[s][o] = B.setdefault (s, {}). SetDefault (o, 0) + 1 v.add (o) q.a DD (s) if pre_s = =-1: # count the number of times each sentence begins with the first state pi[s] = Pi.setdefault (s, 0) + 1 Else: # Stats status pre_s to state s
The number of times a[pre_s][s] = A.setdefault (pre_s, {}). SetDefault (s, 0) + 1 pre_s = s #切换到下一个状态
# probability normalization for I in A.keys (): prob_sum = 0 for J in A[i].keys (): Prob_sum + = A[i][j]
For j in A[i].keys (): a[i][j] = 1.0 * A[I][J]/prob_sum for I in B.keys (): prob_sum = 0 For j in B[i].keys (): Prob_sum + = B[i][j] for j in B[i].keys (): b[i][j] = 1.0 * B[I][J]/ Prob_sum prob_sum = SUM (pi.values ()) for I in Pi.keys (): pi[i] = 1.0 * Pi[i]/prob_sum print "Finis
Hed training ... "Return A, B, Pi, V, q def Savemodel (A, B, Pi, V, q): With open (" A.param "," WB ") as outfile: Pickle.dump (A, outfile) with open ("B.param", "WB") as Outfile:pickle.dump (B, outfile) with open ("PI . Param "," WB ") as OutfilE:pickle.dump (PI, outfile) with open ("V.param", "WB") as Outfile:pickle.dump (V, outfile) with op En ("Q.param", "WB") as Outfile:pickle.dump (Q, outfile) #维特比 def predict (X, A, B, PI, V, Q): W = [{} for T ' in Range (len (X))] #相当于书上的δ path = {} for s in q:w[0][s] = 1.0 * Pi.get (S, default_prob) * B.get (s, {}). Get ( X[0], Default_prob) #0时刻状态为s的概率 path[s] = [s] for T in range (1, Len (X)): New_path = {} for S I N Q: #两轮循环暴力求解 max_prob = Min_prob max_s = "for pre_s in Q:prob = W [T-1] [pre_s] * \ a.get (pre_s, {}). Get (S, default_prob) * \ b.get (s, {}). Get (X[t], Default_prob) (Max_prob, max_s) = Max ((Max_prob, max_s), (PROB, pre_s)) #全由第一个prob决定 W[t][s]
= Max_prob #t时刻状态为s的最大概率 tmp = deepcopy (path[max_s]) tmp.append (s) new_path[s] = tmp Path = new_pAth (max_prob, max_s) = Max ((W[len (X) -1][s], s) for S in Q) # The last time the probability of each state is the largest return path[max_s] def getmodel ():
With open ("A.param", "RB") as Infile:a = Pickle.load (infile) with open ("B.param", "RB") as infile: B = Pickle.load (infile) with open ("Pi.param", "RB") as Infile:pi = Pickle.load (infile) with open ("V.PA Ram "," RB ") as Infile:v = Pickle.load (infile) with open (" Q.param "," RB ") as Infile:q = Pickle.load (i Nfile) return A, B, Pi, V, q def Test (A, B, Pi, V, q): print "Start testing" with open (Test_path, "RB") As infile, \ Open (Output_path, "WB") as Outfile:x_test = [] Y_test = [] for line in INFI Le:segs = Line.strip (). Split (' \ t ') If Len (segs)! = 2: # When encountering empty rows if Len (x_test) = = 0 : #一整句 such as nbad continue preds = Predict (X_test, A, B, PI, V, Q) for Vals In Zip (x_test, y_test,Preds): Outfile.write ("\ T". Join (Vals) + "\ n") outfile.write ("\ n") X _test = [] Y_test = [] Else:o = segs[0] # t moment of observation o s = segs[1] # T-moment status S X_test.append (o) y_test.append (s) print "finished testing" def Mai N (): If is_train:a, B, Pi, V, q = Train () Savemodel (A, B, Pi, V, q) else:a, B, Pi, V, q
= Getmodel () test (A, B, PI, V, Q) if __name__ = = ' __main__ ': Main ()
Data available at https://github.com/guotong1988/MachineLearningFromZero
Reference: *Statistical Learning Methods* (统计学习方法, Li Hang) — HMM and Viterbi chapters.