import heapq
import random
class Classifier:
    """k-nearest-neighbor classifier trained from tab-separated bucket files.

    Training data comes from files named "<bucketPrefix>-01" .. "<bucketPrefix>-10",
    skipping the test bucket; each attribute column is normalized with the
    modified standard score (median / absolute standard deviation).
    """

    def __init__(self, bucketPrefix, testBucketNumber, dataFormat, k):
        """Build a classifier from the bucket files with prefix bucketPrefix,
        excluding the file numbered testBucketNumber.

        dataFormat is a tab-separated string describing how to interpret each
        column of a data line. For example, for the mpg data the format is:
        "class\tnum\tnum\tnum\tnum\tnum\tcomment"
        """
        self.medianAndDeviation = []
        self.k = k
        # how to interpret each column of a data line
        self.format = dataFormat.strip().split('\t')
        self.data = []
        # read every bucket except the one reserved for testing
        for i in range(1, 11):
            if i != testBucketNumber:
                filename = "%s-%02i" % (bucketPrefix, i)
                f = open(filename)
                lines = f.readlines()
                f.close()
                # NOTE: the first line of each bucket file is skipped
                # (presumably a header) — confirm against the data files.
                for line in lines[1:]:
                    fields = line.strip().split('\t')
                    ignore = []
                    vector = []
                    # j (not i) so the outer bucket index is not shadowed
                    for j in range(len(fields)):
                        if self.format[j] == 'num':
                            vector.append(float(fields[j]))
                        elif self.format[j] == 'comment':
                            ignore.append(fields[j])
                        elif self.format[j] == 'class':
                            classification = fields[j]
                    self.data.append((classification, vector, ignore))
        # keep an un-normalized copy of the instances
        self.rawData = list(self.data)
        # length of an instance vector
        self.vlen = len(self.data[0][1])
        # normalize every attribute column in place
        for i in range(self.vlen):
            self.normalizeColumn(i)

    ##################################################
    ###
    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE

    def getMedian(self, alist):
        """Return the median of alist (an empty list for empty input)."""
        if alist == []:
            return []
        blist = sorted(alist)
        length = len(alist)
        if length % 2 == 1:
            # odd length: return the middle element
            return blist[int(((length + 1) / 2) - 1)]
        else:
            # even length: average the two middle elements
            v1 = blist[int(length / 2)]
            v2 = blist[int(length / 2) - 1]
            return (v1 + v2) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Given alist and its median, return the absolute standard deviation."""
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)

    def normalizeColumn(self, columnNumber):
        """Normalize column columnNumber of self.data using the modified
        standard score: (value - median) / absolute standard deviation."""
        # extract the column values to a list
        col = [v[1][columnNumber] for v in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        # remember the parameters so unseen vectors can be normalized too
        self.medianAndDeviation.append((median, asd))
        for v in self.data:
            v[1][columnNumber] = (v[1][columnNumber] - median) / asd

    def normalizeVector(self, v):
        """Normalize vector v using the stored per-column median and asd."""
        vector = list(v)
        for i in range(len(vector)):
            (median, asd) = self.medianAndDeviation[i]
            vector[i] = (vector[i] - median) / asd
        return vector

    ###
    ###  END NORMALIZATION
    ##################################################

    def testBucket(self, bucketPrefix, bucketNumber):
        """Evaluate the classifier with data from the file
        bucketPrefix-bucketNumber; return a confusion-matrix dict
        {realClass: {classifiedAs: count}}."""
        filename = "%s-%02i" % (bucketPrefix, bucketNumber)
        f = open(filename)
        lines = f.readlines()
        totals = {}
        f.close()
        for line in lines:
            data = line.strip().split('\t')
            vector = []
            classInColumn = -1
            for i in range(len(self.format)):
                if self.format[i] == 'num':
                    vector.append(float(data[i]))
                elif self.format[i] == 'class':
                    classInColumn = i
            theRealClass = data[classInColumn]
            classifiedAs = self.classify(vector)
            totals.setdefault(theRealClass, {})
            totals[theRealClass].setdefault(classifiedAs, 0)
            totals[theRealClass][classifiedAs] += 1
        return totals

    def manhattan(self, vector1, vector2):
        """Compute the Manhattan distance between two vectors."""
        return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))

    def knn(self, itemVector):
        """Return the predicted class of itemVector using k nearest neighbors."""
        # heapq.nsmallest (rather than min) yields the k closest neighbors
        neighbors = heapq.nsmallest(
            self.k,
            [(self.manhattan(itemVector, item[1]), item) for item in self.data])
        # each neighbor gets one vote for its class
        results = {}
        for neighbor in neighbors:
            theClass = neighbor[1][0]
            results.setdefault(theClass, 0)
            results[theClass] += 1
        resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True)
        # collect every class that received the maximum number of votes
        maxVotes = resultList[0][0]
        possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes]
        # break ties randomly among the top-voted classes
        answer = random.choice(possibleAnswers)
        return answer

    def classify(self, itemVector):
        """Return the class we think itemVector is in."""
        # k (how many neighbors to use) was fixed at construction time
        return self.knn(self.normalizeVector(itemVector))
def tenfold(bucketPrefix, dataFormat, k):
    """Run ten-fold cross-validation over the bucket files with prefix
    bucketPrefix, training a k-NN Classifier on nine buckets and testing on
    the remaining one, then print the aggregated confusion matrix and the
    percent of instances classified correctly."""
    results = {}
    for i in range(1, 11):
        c = Classifier(bucketPrefix, i, dataFormat, k)
        t = c.testBucket(bucketPrefix, i)
        # merge this fold's confusion counts into the running totals
        for (key, value) in t.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
    # now print the results as a confusion matrix
    categories = list(results.keys())
    categories.sort()
    print("\n       Classified as: ")
    header = "        "
    subheader = "      +"
    for category in categories:
        header += "% 2s   " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s    |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
            else:
                count = 0
            row += " %3i |" % count
            total += count
            # diagonal entries are the correctly classified instances
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)
# Run ten-fold cross-validation with k = 1 on both Pima datasets.
print("SMALL DATA SET")
tenfold("pimaSmall/pimaSmall",
        "num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass", 1)

print("\n\nLARGE DATA SET")
tenfold("pima/pima",
        "num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass", 1)

# tenfold("mpgData/mpgData", "class\tnum\tnum\tnum\tnum\tnum\tcomment", 1)
# kNN Python code