Implementation code:
import struct
import numpy as np
import time
def read_image(file_name):
    # Read the whole IDX file into memory as bytes
    file_handle = open(file_name, "rb")  # open the file in binary mode
    file_content = file_handle.read()    # read everything into a buffer
    offset = 0
    head = struct.unpack_from('>iiii', file_content, offset)  # first 4 big-endian integers: magic, count, rows, cols
    offset += struct.calcsize('>iiii')
    imgNum = head[1]  # number of images
    rows = head[2]    # image height
    cols = head[3]    # image width
    image_size = rows * cols  # pixels per image
    images = np.empty((imgNum, image_size))  # np.empty allocates without initializing, the fastest way to create an array
    fmt = '>' + str(image_size) + 'B'  # format string for a single image
    for i in range(imgNum):
        images[i] = np.array(struct.unpack_from(fmt, file_content, offset))
        # images[i] = np.array(struct.unpack_from(fmt, file_content, offset)).reshape((rows, cols))
        offset += struct.calcsize(fmt)
    return images
# Read the labels
def read_label(file_name):
    file_handle = open(file_name, "rb")  # open the file in binary mode
    file_content = file_handle.read()    # read everything into a buffer
    head = struct.unpack_from('>ii', file_content, 0)  # first 2 big-endian integers: magic, count
    offset = struct.calcsize('>ii')
    labelNum = head[1]  # number of labels
    # print(labelNum)
    bitsString = '>' + str(labelNum) + 'B'  # format string, e.g. '>60000B'
    label = struct.unpack_from(bitsString, file_content, offset)  # returns a tuple of labels
    return np.array(label)
def loaddataset():
    # MNIST
    train_x_filename = "train-images-idx3-ubyte"
    train_y_filename = "train-labels-idx1-ubyte"
    test_x_filename = "t10k-images-idx3-ubyte"
    test_y_filename = "t10k-labels-idx1-ubyte"
    # # Fashion-MNIST
    # train_x_filename = "fashion-train-images-idx3-ubyte"
    # train_y_filename = "fashion-train-labels-idx1-ubyte"
    # test_x_filename = "fashion-t10k-images-idx3-ubyte"
    # test_y_filename = "fashion-t10k-labels-idx1-ubyte"
    train_x = read_image(train_x_filename)  # 60000 x 784 matrix
    train_y = read_label(train_y_filename)  # 60000 labels
    test_x = read_image(test_x_filename)    # 10000 x 784 matrix
    test_y = read_label(test_y_filename)    # 10000 labels
    train_x = normalize(train_x)
    test_x = normalize(test_x)
    # # To speed up debugging, shrink the dataset first
    # train_x = train_x[0:1000, :]
    # train_y = train_y[0:1000]
    # test_x = test_x[0:500, :]
    # test_y = test_y[0:500]
    return train_x, test_x, train_y, test_y
def normalize(data):  # binarize the pixels so every feature follows a 0/1 distribution
    m = data.shape[0]
    n = np.array(data).shape[1]
    for i in range(m):
        for j in range(n):
            if data[i, j] != 0:
                data[i, j] = 1
            else:
                data[i, j] = 0
    return data
# (1) Compute the prior and conditional probabilities
def train_model(train_x, train_y, classNum):  # classNum is 10; train_x passed in is already binarized
    m = train_x.shape[0]
    n = train_x.shape[1]
    # prior_probability = np.zeros(n)  # prior probability
    prior_probability = np.zeros(classNum)  # prior probability
    conditional_probability = np.zeros((classNum, n, 2))  # conditional probability
    # Accumulate the prior and conditional counts
    for i in range(m):  # m is the number of images, 60000 in total
        img = train_x[i]    # the i-th image, a 1 x n row vector
        label = train_y[i]  # the label of the i-th image
        prior_probability[label] += 1  # count of class label for P(y=ck); dividing prior_probability[label] by m gives the class prior
        for j in range(n):  # n is the number of features, 784
            temp = img[j].astype(int)  # img[j] is a float such as 0.0; an index must be an integer
            conditional_probability[label][j][temp] += 1
            # conditional_probability[label][j][img[j]] += 1  # count, per class and per pixel, how often the value is 0 or 1, which gives the conditional probability
    # Rescale the probabilities into the range [1, 10001]
    for i in range(classNum):
        for j in range(n):
            # After binarization a pixel can only take the two values 0 and 1
            pix_0 = conditional_probability[i][j][0]
            pix_1 = conditional_probability[i][j][1]
            # Conditional probabilities of pixel values 0 and 1
            probability_0 = (float(pix_0) / float(pix_0 + pix_1)) * 10000 + 1
            probability_1 = (float(pix_1) / float(pix_0 + pix_1)) * 10000 + 1
            conditional_probability[i][j][0] = probability_0
            conditional_probability[i][j][1] = probability_1
    return prior_probability, conditional_probability
# (2) For a given x, compute the product of the prior and the conditional probabilities
def cal_probability(img, label, prior_probability, conditional_probability):
    probability = int(prior_probability[label])  # prior probability (kept as a count)
    n = img.shape[0]
    # print(n)
    for i in range(n):  # n is the number of features
        probability *= int(conditional_probability[label][i][img[i].astype(int)])
    return probability
# Decide the class of instance x, i.e. an argmax over the classes
def predict(test_x, test_y, prior_probability, conditional_probability):  # the test_x (or train_x) passed in is already binarized
    predict_y = []
    m = test_x.shape[0]
    n = test_x.shape[1]
    for i in range(m):
        img = np.array(test_x[i])  # img is already a binarized row vector
        label = test_y[i]
        max_label = 0
        max_probability = cal_probability(img, 0, prior_probability, conditional_probability)
        for j in range(1, 10):  # start at index 1 because index 0 supplied the initial value
            probability = cal_probability(img, j, prior_probability, conditional_probability)
            if max_probability < probability:
                max_probability = probability
                max_label = j
        predict_y.append(max_label)  # record the most probable label for each row
    return np.array(predict_y)
def cal_accuracy(test_y, predict_y):
    m = test_y.shape[0]
    errorCount = 0.0
    for i in range(m):
        if test_y[i] != predict_y[i]:
            errorCount += 1
    accuracy = 1.0 - float(errorCount) / m
    return accuracy
if __name__ == '__main__':
    classNum = 10
    print("Start reading data...")
    time1 = time.time()
    train_x, test_x, train_y, test_y = loaddataset()  # loaddataset() already binarizes the images, so no extra normalize() call is needed
    time2 = time.time()
    print("read data cost", time2 - time1, "second")
    print("Start training data...")
    prior_probability, conditional_probability = train_model(train_x, train_y, classNum)
    for i in range(classNum):
        print(prior_probability[i])  # print the total number of images for each label
    time3 = time.time()
    print("train data cost", time3 - time2, "second")
    print("Start predicting data...")
    predict_y = predict(test_x, test_y, prior_probability, conditional_probability)
    time4 = time.time()
    print("predict data cost", time4 - time3, "second")
    print("Start calculate accuracy...")
    acc = cal_accuracy(test_y, predict_y)
    time5 = time.time()
    print("accuracy", acc)
    print("calculate accuracy cost", time5 - time4, "second")
Result: the printed values (5923.0, ...) are the total number of training images for each label.
The hand-written naive Bayes implementation reaches an accuracy of 84.12%; calling sklearn's BernoulliNB gives 84.27%.
The code for calling BernoulliNB from sklearn is as follows:
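Since the original snippet is not reproduced here, the following is only a minimal sketch of that call, assuming the loaddataset() and cal_accuracy() helpers defined above and the standard scikit-learn API:

# Minimal sketch (not the author's original snippet): BernoulliNB on the same binarized data
from sklearn.naive_bayes import BernoulliNB

train_x, test_x, train_y, test_y = loaddataset()
clf = BernoulliNB()              # Bernoulli naive Bayes suits the 0/1 pixel features
clf.fit(train_x, train_y)        # estimate class priors and per-pixel probabilities
predict_y = clf.predict(test_x)  # predict a label for every test image
print("accuracy", cal_accuracy(test_y, predict_y))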
Result screenshot:
Optimization: add principal component analysis (PCA) to reduce the dimensionality of the data; the code is as follows:
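The original PCA snippet is likewise not shown (and the step is marked below as pending modification), so this is only a rough sketch. The choice of 50 components and of GaussianNB are assumptions on my part, since PCA produces continuous features that the binary Bernoulli model no longer fits:

# Rough sketch only: PCA for dimensionality reduction, then a naive Bayes classifier.
# n_components=50 and GaussianNB are assumed choices, not taken from the original post.
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

train_x, test_x, train_y, test_y = loaddataset()
pca = PCA(n_components=50)                # keep the 50 strongest components
train_x_pca = pca.fit_transform(train_x)  # fit the projection on the training images
test_x_pca = pca.transform(test_x)        # project the test images with the same basis
clf = GaussianNB()                        # continuous PCA features call for a Gaussian model
clf.fit(train_x_pca, train_y)
predict_y = clf.predict(test_x_pca)
print("accuracy", cal_accuracy(test_y, predict_y))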
Result screenshot:
Pending modification!
Reference link: 51967839 (Python naive Bayes classification of the MNIST dataset)