---restore content starts---
"Machine learning" is indeed a learning Python, mastering data-related skills, a rare good book!!
The source code for the nearest neighbor algorithm is given below for beginners who want to study it; experts can skip this post.
Digit recognition file
" "Created on Sep, 2010knn:k Nearest NeighborsInput:inX:vector to compare to existing dataset (1xN) Dataset:size m data set of known vectors (NxM) labels:data set labels (1xM vector) K:number of n Eighbors to comparison (should is an odd number) Output:the most popular class Label@author:pbhar Rin" " fromNumPyImport*Importoperator fromOsImportListdirdefclassify0 (InX, DataSet, labels, k): Datasetsize=Dataset.shape[0] Diffmat= Tile (InX, (datasetsize,1))-DataSet Sqdiffmat= Diffmat**2sqdistances= Sqdiffmat.sum (Axis=1) Distances= sqdistances**0.5sorteddistindicies=distances.argsort () ClassCount={} forIinchRange (k): Voteilabel=Labels[sorteddistindicies[i]] Classcount[voteilabel]= Classcount.get (voteilabel,0) + 1Sortedclasscount= Sorted (Classcount.iteritems (), Key=operator.itemgetter (1), reverse=True)returnSortedclasscount[0][0]defCreateDataSet (): Group= Array ([[[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]]) labels= ['A','A','B','B'] returnGroup, LabelsdefFile2matrix (filename): Fr=open (filename) numberoflines= Len (Fr.readlines ())#get The number of lines in the fileReturnmat = Zeros ((numberoflines,3))#prepare matrix to returnClasslabelvector = []#Prepare labels returnFR =open (filename) index=0 forLineinchfr.readlines (): line=Line.strip () listfromline= Line.split ('\ t') Returnmat[index,:]= Listfromline[0:3] # Read the top three attribute values classlabelvector.append (int (listfromline[-1]) # reads the class label index+ = 1returnReturnmat,classlabelvectordefAutonorm (dataSet): Minvals=dataset.min (0) maxvals=Dataset.max (0) ranges= Maxvals-minvals Normdataset=zeros (Shape (dataSet)) m=Dataset.shape[0] Normdataset= Dataset-tile (Minvals, (m,1)) Normdataset= Normdataset/tile (ranges, (m,1))#element wise divide returnnormdataset, ranges, minvalsdefdatingclasstest (): HoRatio= 0.50#Hold out 10%Datingdatamat,datinglabels = File2matrix ('DatingTestSet2.txt')#Load Data setfrom fileNormmat, ranges, minvals =autonorm (Datingdatamat) m=Normmat.shape[0] Numtestvecs= Int 
(m*hoRatio) Errorcount= 0.0 forIinchRange (numtestvecs): Classifierresult= Classify0 (normmat[i,:],normmat[numtestvecs:m,:],datinglabels[numtestvecs:m],3) Print "The classifier came back with:%d, the real answer is:%d"%(Classifierresult, datinglabels[i])if(Classifierresult! = Datinglabels[i]): Errorcount + = 1.0Print "The total error rate is:%f"% (errorcount/float (numtestvecs))PrintErrorcount
Handwritten digit recognition
# Parse the text data
def img2vector(filename):
    """Flatten a 32x32 text image of digits into a (1, 1024) vector.

    The file is read as 32 lines; the first 32 characters of each line are
    converted with ``int`` and stored row by row.

    Args:
        filename: path of the text image to read.

    Returns:
        A numpy array of shape (1, 1024) holding the character values.
    """
    returnVect = zeros((1, 1024))
    # The context manager guarantees the file is closed after reading.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
# test
def handwritingclasstest(trainingDir='trainingdigits', testDir='testdigits', k=3):
    """Print the kNN error count and error rate for the digit images.

    Every file in ``trainingDir`` becomes one training row; every file in
    ``testDir`` is classified with ``classify0`` and compared with the digit
    encoded in its file name (the part before the first ``_``).

    Args:
        trainingDir: directory holding the training images.
        testDir: directory holding the test images.
        k: number of neighbours used by ``classify0``.
    """
    # NOTE(review): the default directory names are copied verbatim from the
    # source text; confirm their letter case against the actual directories.
    hwLabels = []
    trainingFileList = listdir(trainingDir)  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    testFileList = listdir(testDir)  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("The classifier came back with:%d, the real answer is:%d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is:%d" % errorCount)
    print("\nthe total error rate is:%f" % (errorCount / float(mTest)))
Improving matches on a dating site using the k-nearest neighbors algorithm