Python naive Bayes classification of the MNIST dataset

Source: Internet
Author: User

Implementation code:
import struct
import time

import numpy as np
from numpy import *
def read_image(file_name):
    """Read an IDX3-ubyte image file (MNIST format).

    Returns an (img_num, rows*cols) float array with one flattened
    image per row.
    """
    # Read the whole file into memory as raw bytes.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    offset = 0
    # Header: magic number, image count, rows, cols (4 big-endian uint32).
    head = struct.unpack_from('>IIII', file_content, offset)
    offset += struct.calcsize('>IIII')
    img_num = head[1]
    rows = head[2]
    cols = head[3]

    image_size = rows * cols  # pixels per image (784 for MNIST 28x28)
    # np.empty skips zero-initialization; every row is overwritten below.
    # (Generalized from the original hard-coded 784 to rows*cols.)
    images = np.empty((img_num, image_size))
    fmt = '>' + str(image_size) + 'B'  # one image = image_size unsigned bytes
    fmt_size = struct.calcsize(fmt)    # hoisted: constant per image

    for i in range(img_num):
        images[i] = np.array(struct.unpack_from(fmt, file_content, offset))
        offset += fmt_size
    return images

#读取标签
def read_label(file_name):
    """Read an IDX1-ubyte label file (MNIST format).

    Returns a 1-D integer array of labels.
    """
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    # Header: magic number and label count (2 big-endian uint32).
    head = struct.unpack_from('>II', file_content, 0)
    offset = struct.calcsize('>II')
    label_num = head[1]

    fmt = '>' + str(label_num) + 'B'  # e.g. '>60000B': one byte per label
    labels = struct.unpack_from(fmt, file_content, offset)
    return np.array(labels)

def loaddataset():
    """Load the MNIST train/test images and labels from the working directory.

    Returns (train_x, test_x, train_y, test_y); the image matrices are
    binarized to {0, 1} via normalize() before being returned.
    """
    # MNIST (standard file names from the distribution archives)
    train_x_filename = "train-images-idx3-ubyte"
    train_y_filename = "train-labels-idx1-ubyte"
    test_x_filename = "t10k-images-idx3-ubyte"
    test_y_filename = "t10k-labels-idx1-ubyte"

    # # Fashion-MNIST alternative:
    # train_x_filename = "fashion-train-images-idx3-ubyte"
    # train_y_filename = "fashion-train-labels-idx1-ubyte"
    # test_x_filename = "fashion-t10k-images-idx3-ubyte"
    # test_y_filename = "fashion-t10k-labels-idx1-ubyte"

    train_x = read_image(train_x_filename)  # 60000 x 784 matrix
    train_y = read_label(train_y_filename)  # 60000 labels
    test_x = read_image(test_x_filename)    # 10000 x 784 matrix
    test_y = read_label(test_y_filename)    # 10000 labels

    # Binarize pixels so the Bernoulli naive-Bayes counts are well defined.
    train_x = normalize(train_x)
    test_x = normalize(test_x)

    # # While debugging, shrink the dataset for speed:
    # train_x = train_x[0:1000, :]
    # train_y = train_y[0:1000]
    # test_x = test_x[0:500, :]
    # test_y = test_y[0:500]

    return train_x, test_x, train_y, test_y

def normalize(data):
    """Binarize pixel values in place: nonzero -> 1, zero stays 0.

    Vectorized replacement for the original per-element double loop
    (same result, one C-level pass). Mutates `data` and also returns it.
    Idempotent: applying it twice is a no-op.
    """
    data[data != 0] = 1
    return data

# (1) Compute prior counts and (scaled) conditional probabilities
def train_model(train_x, train_y, classnum):
    """Estimate naive-Bayes parameters from binarized training data.

    Parameters
    ----------
    train_x : (m, n) array with values in {0, 1} (already binarized)
    train_y : (m,) integer labels in [0, classnum)
    classnum : number of classes (10 for MNIST digits)

    Returns
    -------
    prior_probability : (classnum,) array of raw per-class sample counts
        (dividing by m would give P(y = c_k); the argmax is unaffected).
    conditional_probability : (classnum, n, 2) array where [c][j][v] is
        the conditional probability of pixel j taking value v in class c,
        scaled into [1, 10001] so that the integer product computed in
        cal_probability never multiplies by zero.
    """
    m = train_x.shape[0]
    n = train_x.shape[1]
    prior_probability = np.zeros(classnum)
    conditional_probability = np.zeros((classnum, n, 2))

    # Count label frequencies and, per class, how often each pixel is 0 or 1.
    for i in range(m):
        img = train_x[i]      # i-th image as a length-n row vector
        label = train_y[i]
        prior_probability[label] += 1
        for j in range(n):
            pixel = int(img[j])  # pixel is 0.0 or 1.0; index must be int
            conditional_probability[label][j][pixel] += 1

    # Rescale counts into [1, 10001]: p * 10000 + 1. The +1 acts as
    # smoothing so later products never collapse to zero.
    for i in range(classnum):
        for j in range(n):
            pix_0 = conditional_probability[i][j][0]
            pix_1 = conditional_probability[i][j][1]
            total = pix_0 + pix_1
            if total == 0:
                # Class never observed in training: the original divided by
                # zero here; leave a neutral value instead.
                conditional_probability[i][j][0] = 1
                conditional_probability[i][j][1] = 1
                continue
            conditional_probability[i][j][0] = (float(pix_0) / float(total)) * 10000 + 1
            conditional_probability[i][j][1] = (float(pix_1) / float(total)) * 10000 + 1
    return prior_probability, conditional_probability

# (2) Product of the prior and the conditional probabilities for a given x
def cal_probability(img, label, prior_probability, conditional_probability):
    """Unnormalized naive-Bayes score of `img` under class `label`.

    Multiplies the (integer-truncated) prior count by each pixel's scaled
    conditional probability. Python ints are unbounded, so the 784-factor
    product cannot overflow.
    """
    probability = int(prior_probability[label])  # prior: raw class count
    n = img.shape[0]  # number of features (pixels)
    for i in range(n):
        # img[i] is 0.0 or 1.0; cast to int to use it as an index.
        probability *= int(conditional_probability[label][i][int(img[i])])
    return probability

# Decide the class of each instance x — equivalent to argmax over labels
def predict(test_x, test_y, prior_probability, conditional_probability):
    """Classify each (already binarized) row of test_x.

    `test_y` is accepted for interface compatibility with the original
    call sites but is not used in the prediction itself.
    Returns an array of predicted labels.
    """
    # Generalized from the original hard-coded 10 classes.
    classnum = conditional_probability.shape[0]
    predict_y = []
    m = test_x.shape[0]
    for i in range(m):
        img = np.array(test_x[i])  # binarized row vector for sample i
        max_label = 0
        max_probability = cal_probability(img, 0, prior_probability,
                                          conditional_probability)
        # Start at label 1: label 0 supplied the initial maximum.
        for j in range(1, classnum):
            probability = cal_probability(img, j, prior_probability,
                                          conditional_probability)
            if max_probability < probability:
                max_probability = probability
                max_label = j
        predict_y.append(max_label)  # most probable label for this row
    return np.array(predict_y)

def cal_accuracy(test_y, predict_y):
    """Return the fraction of positions where predict_y matches test_y."""
    m = test_y.shape[0]
    error_count = 0.0
    for i in range(m):
        if test_y[i] != predict_y[i]:
            error_count += 1
    return 1.0 - float(error_count) / m

if __name__ == '__main__':
    classnum = 10  # ten digit classes
    print("start reading data...")
    time1 = time.time()
    # loaddataset() already binarizes both image sets; the original script
    # normalized a second time here, which was redundant (normalize is
    # idempotent), so the extra pass is dropped.
    train_x, test_x, train_y, test_y = loaddataset()
    time2 = time.time()
    print("read data cost", time2 - time1, "second")

    print("start training data...")
    prior_probability, conditional_probability = train_model(train_x, train_y, classnum)
    for i in range(classnum):
        print(prior_probability[i])  # per-label training sample count
    time3 = time.time()
    print("train data cost", time3 - time2, "second")

    print("start predicting data...")
    predict_y = predict(test_x, test_y, prior_probability, conditional_probability)
    time4 = time.time()
    print("predict data cost", time4 - time3, "second")

    print("start calculate accuracy...")
    acc = cal_accuracy(test_y, predict_y)
    time5 = time.time()
    print("accuracy", acc)
    print("calculate accuracy cost", time5 - time4, "second")
Result: the program prints per-class training counts (5923.0, ...) — these are the sample totals exported for each digit label.

Calling the hand-written naive Bayes implementation gives an accuracy of 84.12%; calling sklearn's BernoulliNB gives 84.27%.

The code for calling sklearn's BernoulliNB is as follows:

Result screenshot:

Optimization: Add the Principal component analysis method to reduce the dimension operation, the code is as follows:

Result screenshot:

Pending modification!

Reference link: 51967839

Python naive Bayesian classification mnist datasets

Related Article

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.