import numpy as np


def load_data_set():
    """Return a toy corpus of tokenized posts and their class labels.

    Returns:
        posting_list: list of token lists, one per post.
        class_vec: parallel list of labels — 1 = abusive, 0 = not abusive.
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    class_vec = [0, 1, 0, 1, 0, 1]
    return posting_list, class_vec


def create_vocab_list(data_set):
    """Build the vocabulary: a list of every unique word in the corpus."""
    vocab_set = set()
    for document in data_set:
        vocab_set |= set(document)
    return list(vocab_set)


def set_of_words2vec(vocab_list, input_set):
    """Convert a document to a set-of-words (binary presence) vector.

    Each slot of the returned vector is 1 if the corresponding vocabulary
    word appears in input_set at least once, else 0. Words not in the
    vocabulary are reported and otherwise ignored.
    """
    ret_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            ret_vec[vocab_list.index(word)] = 1
        else:
            print('word', word, 'not in dict')
    return ret_vec


def bag_of_words2vec_mn(vocab_list, input_set):
    """Convert a document to a bag-of-words (occurrence count) vector.

    Like set_of_words2vec, but counts every occurrence of each word
    instead of recording mere presence. Out-of-vocabulary words are
    silently skipped.
    """
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] += 1
    return return_vec


def train_nb0(train_matrix, train_category):
    """Train a naive Bayes classifier from document vectors and labels.

    Args:
        train_matrix: 2-D array, one word-vector row per training document.
        train_category: 1-D array of labels (1 = abusive, 0 = not).

    Returns:
        p0_vect: log P(word | class 0) for each vocabulary word.
        p1_vect: log P(word | class 1) for each vocabulary word.
        p_abusive: prior probability P(class 1).
    """
    num_train_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    p_abusive = sum(train_category) / float(num_train_docs)
    # Laplace smoothing: initialize counts to 1 (and denominators to 2)
    # so that a word unseen in one class cannot zero out the whole product.
    p0_num = np.ones(num_words)
    p1_num = np.ones(num_words)
    p0_denom = 2.0
    p1_denom = 2.0
    for i in range(num_train_docs):
        if train_category[i] == 1:
            p1_num += train_matrix[i]
            p1_denom += sum(train_matrix[i])
        else:
            p0_num += train_matrix[i]
            p0_denom += sum(train_matrix[i])
    # Work in log space to avoid floating-point underflow when many
    # small per-word probabilities are multiplied together.
    p1_vect = np.log(p1_num / p1_denom)
    p0_vect = np.log(p0_num / p0_denom)
    return p0_vect, p1_vect, p_abusive


def classify_nb(vec2classify, p0_vec, p1_vec, p_class1):
    """Classify a word vector: return 1 (abusive) or 0 (not abusive).

    Sums log-probabilities (equivalent to multiplying probabilities)
    and compares the two class scores, including each class prior.
    """
    p1 = sum(vec2classify * p1_vec) + np.log(p_class1)
    p0 = sum(vec2classify * p0_vec) + np.log(1.0 - p_class1)
    if p1 > p0:
        return 1
    else:
        return 0


def testing_nb():
    """Train on the toy corpus and classify two example posts."""
    list_of_posts, list_classes = load_data_set()
    my_vocab_list = create_vocab_list(list_of_posts)
    train_mat = []
    for post_in_doc in list_of_posts:
        train_mat.append(set_of_words2vec(my_vocab_list, post_in_doc))
    p0v, p1v, p_ab = train_nb0(np.array(train_mat), np.array(list_classes))
    test_entry = ['love', 'my', 'dalmation']
    this_doc = np.array(set_of_words2vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:', classify_nb(this_doc, p0v, p1v, p_ab))
    test_entry = ['stupid', 'garbage']
    this_doc = np.array(set_of_words2vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:', classify_nb(this_doc, p0v, p1v, p_ab))


def main():
    testing_nb()


if __name__ == '__main__':
    main()
Study Log — the Naive Bayes algorithm