"""Naive Bayes text classifier trained on a small hand-labelled sample.

The module builds a vocabulary from six tokenised posts, converts each post
to a set-of-words vector, estimates per-class log word probabilities with
add-one smoothing, and classifies a test entry.  The demo at the bottom of
the module runs at import time and prints the predicted class.
"""
# The star import is kept so that any code relying on NumPy names being
# available at module level keeps working; new code below uses ``np.``.
from numpy import *  # noqa: F401,F403
import numpy as np


def loadDataSet():
    """Create the experimental sample.

    Returns:
        A ``(postingList, classVec)`` tuple: six tokenised documents and
        their labels, where 1 marks a document counted as "abusive" by
        ``trainNB0`` and 0 marks every other document.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'are', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return a list of the distinct words that appear in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list without duplicates.  Its order comes from a ``set`` and is
        therefore arbitrary.
    """
    vocabSet = set()  # start from the empty set
    for document in dataSet:
        vocabSet |= set(document)  # union with this document's words
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; position defines the feature.
        inputSet: iterable of the words found in the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere.  Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to its FIRST position (what ``list.index`` returns) once,
    # instead of scanning the vocabulary for every input word.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            # Set-of-words model: record presence only.  A bag-of-words
            # model would use ``returnVec[idx] += 1`` here instead.
            returnVec[idx] = 1
        else:
            # Single-argument print gives identical output on Python 2 and 3.
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Train the naive Bayes classifier from word vectors.

    Args:
        trainMatrix: 2-D array-like, one word vector per training document.
        trainCategory: 1-D array-like of labels; documents whose label
            equals 1 form class 1, all others form class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log probabilities for
        class 0 and class 1, and the fraction given by
        ``sum(trainCategory) / number of documents``.
    """
    trainMatrix = np.asarray(trainMatrix)
    trainCategory = np.asarray(trainCategory)

    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = np.sum(trainCategory) / float(numTrainDocs)

    # Counts start at 1 and denominators at 2.0 so that a word never seen in
    # a class does not get probability 0, which would zero the whole product.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]  # must accumulate in place, like class 1
            p0Denom += np.sum(trainMatrix[i])

    # Work in log space: multiplying many small probabilities underflows or
    # loses precision, whereas summing their logarithms does not.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector with the trained naive Bayes model.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0.
    """
    vec2Classify = np.asarray(vec2Classify)
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


# Demo: train on the sample and classify one test entry.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
testEntry = ['stupid', 'garbage']
thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
print("%s classified as:  %s" % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))
# Machine Learning in Action -- naive Bayes