Python naive Bayes classification of the MNIST dataset

Source: Internet
Author: User

Implementation code:
import struct
import time

import numpy as np
from numpy import *
def read_image(file_name):
    """Read an IDX3-ubyte image file (MNIST format).

    Returns an (img_num, rows*cols) float array with one flattened
    image per row.
    """
    # Read the whole file into memory as raw bytes.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    offset = 0
    # Header: magic number, image count, rows, cols (4 big-endian uint32).
    head = struct.unpack_from('>IIII', file_content, offset)
    offset += struct.calcsize('>IIII')
    img_num = head[1]
    rows = head[2]
    cols = head[3]

    image_size = rows * cols  # pixels per image (784 for MNIST 28x28)
    # np.empty skips zero-initialization; every row is overwritten below.
    # (Generalized from the original hard-coded 784 to rows*cols.)
    images = np.empty((img_num, image_size))
    fmt = '>' + str(image_size) + 'B'  # one image = image_size unsigned bytes
    fmt_size = struct.calcsize(fmt)    # hoisted: constant per image

    for i in range(img_num):
        images[i] = np.array(struct.unpack_from(fmt, file_content, offset))
        offset += fmt_size
    return images

#读取标签
def read_label(file_name):
    """Read an IDX1-ubyte label file (MNIST format).

    Returns a 1-D integer array of labels.
    """
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    # Header: magic number and label count (2 big-endian uint32).
    head = struct.unpack_from('>II', file_content, 0)
    offset = struct.calcsize('>II')
    label_num = head[1]

    fmt = '>' + str(label_num) + 'B'  # e.g. '>60000B': one byte per label
    labels = struct.unpack_from(fmt, file_content, offset)
    return np.array(labels)

def loaddataset():
    """Load the MNIST train/test images and labels from the working directory.

    Returns (train_x, test_x, train_y, test_y); the image matrices are
    binarized to {0, 1} via normalize() before being returned.
    """
    # MNIST (standard file names from the distribution archives)
    train_x_filename = "train-images-idx3-ubyte"
    train_y_filename = "train-labels-idx1-ubyte"
    test_x_filename = "t10k-images-idx3-ubyte"
    test_y_filename = "t10k-labels-idx1-ubyte"

    # # Fashion-MNIST alternative:
    # train_x_filename = "fashion-train-images-idx3-ubyte"
    # train_y_filename = "fashion-train-labels-idx1-ubyte"
    # test_x_filename = "fashion-t10k-images-idx3-ubyte"
    # test_y_filename = "fashion-t10k-labels-idx1-ubyte"

    train_x = read_image(train_x_filename)  # 60000 x 784 matrix
    train_y = read_label(train_y_filename)  # 60000 labels
    test_x = read_image(test_x_filename)    # 10000 x 784 matrix
    test_y = read_label(test_y_filename)    # 10000 labels

    # Binarize pixels so the Bernoulli naive-Bayes counts are well defined.
    train_x = normalize(train_x)
    test_x = normalize(test_x)

    # # While debugging, shrink the dataset for speed:
    # train_x = train_x[0:1000, :]
    # train_y = train_y[0:1000]
    # test_x = test_x[0:500, :]
    # test_y = test_y[0:500]

    return train_x, test_x, train_y, test_y

def normalize(data):
    """Binarize pixel values in place: nonzero -> 1, zero stays 0.

    Vectorized replacement for the original per-element double loop
    (same result, one C-level pass). Mutates `data` and also returns it.
    Idempotent: applying it twice is a no-op.
    """
    data[data != 0] = 1
    return data

# (1) Compute prior counts and (scaled) conditional probabilities
def train_model(train_x, train_y, classnum):
    """Estimate naive-Bayes parameters from binarized training data.

    Parameters
    ----------
    train_x : (m, n) array with values in {0, 1} (already binarized)
    train_y : (m,) integer labels in [0, classnum)
    classnum : number of classes (10 for MNIST digits)

    Returns
    -------
    prior_probability : (classnum,) array of raw per-class sample counts
        (dividing by m would give P(y = c_k); the argmax is unaffected).
    conditional_probability : (classnum, n, 2) array where [c][j][v] is
        the conditional probability of pixel j taking value v in class c,
        scaled into [1, 10001] so that the integer product computed in
        cal_probability never multiplies by zero.
    """
    m = train_x.shape[0]
    n = train_x.shape[1]
    prior_probability = np.zeros(classnum)
    conditional_probability = np.zeros((classnum, n, 2))

    # Count label frequencies and, per class, how often each pixel is 0 or 1.
    for i in range(m):
        img = train_x[i]      # i-th image as a length-n row vector
        label = train_y[i]
        prior_probability[label] += 1
        for j in range(n):
            pixel = int(img[j])  # pixel is 0.0 or 1.0; index must be int
            conditional_probability[label][j][pixel] += 1

    # Rescale counts into [1, 10001]: p * 10000 + 1. The +1 acts as
    # smoothing so later products never collapse to zero.
    for i in range(classnum):
        for j in range(n):
            pix_0 = conditional_probability[i][j][0]
            pix_1 = conditional_probability[i][j][1]
            total = pix_0 + pix_1
            if total == 0:
                # Class never observed in training: the original divided by
                # zero here; leave a neutral value instead.
                conditional_probability[i][j][0] = 1
                conditional_probability[i][j][1] = 1
                continue
            conditional_probability[i][j][0] = (float(pix_0) / float(total)) * 10000 + 1
            conditional_probability[i][j][1] = (float(pix_1) / float(total)) * 10000 + 1
    return prior_probability, conditional_probability

# (2) Product of the prior and the conditional probabilities for a given x
def cal_probability(img, label, prior_probability, conditional_probability):
    """Unnormalized naive-Bayes score of `img` under class `label`.

    Multiplies the (integer-truncated) prior count by each pixel's scaled
    conditional probability. Python ints are unbounded, so the 784-factor
    product cannot overflow.
    """
    probability = int(prior_probability[label])  # prior: raw class count
    n = img.shape[0]  # number of features (pixels)
    for i in range(n):
        # img[i] is 0.0 or 1.0; cast to int to use it as an index.
        probability *= int(conditional_probability[label][i][int(img[i])])
    return probability

# Decide the class of each instance x — equivalent to argmax over labels
def predict(test_x, test_y, prior_probability, conditional_probability):
    """Classify each (already binarized) row of test_x.

    `test_y` is accepted for interface compatibility with the original
    call sites but is not used in the prediction itself.
    Returns an array of predicted labels.
    """
    # Generalized from the original hard-coded 10 classes.
    classnum = conditional_probability.shape[0]
    predict_y = []
    m = test_x.shape[0]
    for i in range(m):
        img = np.array(test_x[i])  # binarized row vector for sample i
        max_label = 0
        max_probability = cal_probability(img, 0, prior_probability,
                                          conditional_probability)
        # Start at label 1: label 0 supplied the initial maximum.
        for j in range(1, classnum):
            probability = cal_probability(img, j, prior_probability,
                                          conditional_probability)
            if max_probability < probability:
                max_probability = probability
                max_label = j
        predict_y.append(max_label)  # most probable label for this row
    return np.array(predict_y)

def cal_accuracy(test_y, predict_y):
    """Return the fraction of positions where predict_y matches test_y."""
    m = test_y.shape[0]
    error_count = 0.0
    for i in range(m):
        if test_y[i] != predict_y[i]:
            error_count += 1
    return 1.0 - float(error_count) / m

if __name__ == '__main__':
    classnum = 10  # ten digit classes
    print("start reading data...")
    time1 = time.time()
    # loaddataset() already binarizes both image sets; the original script
    # normalized a second time here, which was redundant (normalize is
    # idempotent), so the extra pass is dropped.
    train_x, test_x, train_y, test_y = loaddataset()
    time2 = time.time()
    print("read data cost", time2 - time1, "second")

    print("start training data...")
    prior_probability, conditional_probability = train_model(train_x, train_y, classnum)
    for i in range(classnum):
        print(prior_probability[i])  # per-label training sample count
    time3 = time.time()
    print("train data cost", time3 - time2, "second")

    print("start predicting data...")
    predict_y = predict(test_x, test_y, prior_probability, conditional_probability)
    time4 = time.time()
    print("predict data cost", time4 - time3, "second")

    print("start calculate accuracy...")
    acc = cal_accuracy(test_y, predict_y)
    time5 = time.time()
    print("accuracy", acc)
    print("calculate accuracy cost", time5 - time4, "second")
Result: the program prints per-class training counts (5923.0, ...) — these are the sample totals exported for each digit label.

Calling the hand-written naive Bayes implementation gives an accuracy of 84.12%; calling sklearn's BernoulliNB gives 84.27%.

The code for calling sklearn's BernoulliNB is as follows:

Result screenshot:

Optimization: Add the Principal component analysis method to reduce the dimension operation, the code is as follows:

Result screenshot:

Pending modification!

Reference link: 51967839

Python naive Bayesian classification mnist datasets

Related Article

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.