Knn's Python Code

Source: Internet
Author: User
Tags: abs

import heapq
import random

class Classifier:
    """k-nearest-neighbor classifier trained on bucketed, tab-separated files.

    Training rows are read from the bucket files ``<bucketprefix>-01`` through
    ``<bucketprefix>-10``, leaving out the one bucket reserved for testing.
    Every numeric column is rescaled with the modified standard score
    ``(value - median) / absolute standard deviation``; a query vector is
    labelled by majority vote among its ``k`` closest training rows, using the
    Manhattan distance.

    Attributes:
        k: number of neighbors that vote.
        format: list of column tags ('num', 'comment' or 'class').
        data: list of ``(classification, vector, ignore)`` tuples; the vectors
            are normalized in place by ``__init__``.
        rawdata: the same rows with their vectors as read from the files.
        vlen: number of numeric columns per row (0 when no rows were read).
        mediananddeviation: one ``(median, asd)`` pair per numeric column.
    """

    def __init__(self, bucketprefix, testbucketnumber, dataformat, k):
        """Build a classifier from the bucket files with the given prefix.

        Args:
            bucketprefix: path prefix of the bucket files; bucket ``i`` is the
                file named ``"%s-%02i" % (bucketprefix, i)``.
            testbucketnumber: bucket (1-10) that is excluded from training.
            dataformat: tab-separated string describing how to interpret each
                column of a data line, e.g. for the MPG data
                ``"class\\tnum\\tnum\\tnum\\tnum\\tnum\\tcomment"``.
            k: number of nearest neighbors used by ``KNN``.

        Raises:
            IOError/OSError: if a bucket file cannot be opened.
            ValueError: if a 'num' column does not parse as a float.
            IndexError: if a line has fewer fields than ``dataformat``.
        """
        self.mediananddeviation = []
        self.k = k
        self.format = dataformat.strip().split('\t')
        self.data = []
        # the buckets are numbered 1 through 10
        for bucket in range(1, 11):
            if bucket == testbucketnumber:
                continue
            filename = "%s-%02i" % (bucketprefix, bucket)
            with open(filename) as f:
                lines = f.readlines()
            # NOTE(review): the first line of every training file is skipped
            # here, whereas testbucket() reads every line -- confirm whether
            # the bucket files really start with a header line.
            for line in lines[1:]:
                fields = line.strip().split('\t')
                ignore = []
                vector = []
                # stays None when the format declares no 'class' column
                classification = None
                for column, kind in enumerate(self.format):
                    if kind == 'num':
                        vector.append(float(fields[column]))
                    elif kind == 'comment':
                        ignore.append(fields[column])
                    elif kind == 'class':
                        classification = fields[column]
                self.data.append((classification, vector, ignore))
        # Copy each vector: normalizecolumn() rewrites self.data in place and
        # a shallow copy of the row list would be normalized along with it.
        self.rawdata = [(label, list(vector), ignore)
                        for (label, vector, ignore) in self.data]
        # length of an instance vector
        self.vlen = len(self.data[0][1]) if self.data else 0
        # now normalize the data, one column at a time
        for column in range(self.vlen):
            self.normalizecolumn(column)

    ##################################################
    ###
    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE

    @staticmethod
    def _standardscore(value, median, asd):
        """Return ``(value - median) / asd``; the plain offset when asd is 0."""
        offset = value - median
        # A constant column has asd == 0; dividing would raise, so the
        # centered value is used unscaled instead.
        return offset / asd if asd else offset

    def Getmedian(self, alist):
        """Return the median of alist.

        An empty list yields ``[]`` (kept for backward compatibility). For an
        even number of items the mean of the two middle items is returned as
        a float.
        """
        if alist == []:
            return []
        blist = sorted(alist)
        middle = len(blist) // 2
        if len(blist) % 2 == 1:
            # odd length: the middle element is the median
            return blist[middle]
        # even length: midpoint of the two central elements
        return (blist[middle] + blist[middle - 1]) / 2.0

    # SOURCE refers to this method under both spellings.
    getmedian = Getmedian

    def getabsolutestandarddeviation(self, alist, median):
        """Return the mean absolute deviation of alist from median.

        Raises:
            ZeroDivisionError: if alist is empty.
        """
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)

    def normalizecolumn(self, columnnumber):
        """Normalize column ``columnnumber`` of every vector in self.data.

        The column's ``(median, asd)`` pair is appended to
        ``self.mediananddeviation`` so that query vectors can be rescaled the
        same way later. Vectors are modified in place.
        """
        col = [row[1][columnnumber] for row in self.data]
        median = self.Getmedian(col)
        asd = self.getabsolutestandarddeviation(col, median)
        self.mediananddeviation.append((median, asd))
        for row in self.data:
            row[1][columnnumber] = self._standardscore(
                row[1][columnnumber], median, asd)

    def normalizevector(self, v):
        """Return a normalized copy of ``v``.

        Uses the median and absolute standard deviation stored for each
        column; ``v`` itself is left untouched.
        """
        vector = list(v)
        for i in range(len(vector)):
            (median, asd) = self.mediananddeviation[i]
            vector[i] = self._standardscore(vector[i], median, asd)
        return vector

    ###
    ###  END NORMALIZATION
    ##################################################

    def testbucket(self, bucketprefix, bucketnumber):
        """Evaluate the classifier with the file bucketprefix-bucketnumber.

        Returns:
            A nested dict ``totals[real_class][classified_as] -> count``.
            When the format has no 'class' column the last field of each
            line is taken as the real class.
        """
        filename = "%s-%02i" % (bucketprefix, bucketnumber)
        with open(filename) as f:
            lines = f.readlines()
        totals = {}
        for line in lines:
            data = line.strip().split('\t')
            vector = []
            classincolumn = -1
            for column, kind in enumerate(self.format):
                if kind == 'num':
                    vector.append(float(data[column]))
                elif kind == 'class':
                    classincolumn = column
            therealclass = data[classincolumn]
            classifiedas = self.classify(vector)
            counts = totals.setdefault(therealclass, {})
            counts[classifiedas] = counts.get(classifiedas, 0) + 1
        return totals

    def Manhattan(self, vector1, vector2):
        """Return the Manhattan distance between two vectors.

        Pairs are formed up to the length of the shorter vector.
        """
        return sum(abs(v1 - v2) for (v1, v2) in zip(vector1, vector2))

    # SOURCE refers to this method under both spellings.
    manhattan = Manhattan

    def KNN(self, itemvector):
        """Return the predicted class of itemvector using k nearest neighbors.

        ``itemvector`` must already be normalized. Each of the ``self.k``
        closest rows gets one vote; if several classes share the highest
        vote count one of them is picked at random.

        Raises:
            ValueError: if there are no training rows (or k is not positive).
        """
        # Rank on the distance only, so that two rows at the same distance
        # never fall through to comparing their payloads.
        neighbors = heapq.nsmallest(
            self.k,
            ((self.Manhattan(itemvector, row[1]), row) for row in self.data),
            key=lambda pair: pair[0])
        if not neighbors:
            raise ValueError("no neighbors available to classify the item")
        # each neighbor gets a vote
        votes = {}
        for (_, row) in neighbors:
            votes[row[0]] = votes.get(row[0], 0) + 1
        maxvotes = max(votes.values())
        # all the classes that received the maximum number of votes
        possibleanswers = sorted(
            (label for (label, count) in votes.items() if count == maxvotes),
            reverse=True)
        return random.choice(possibleanswers)

    def classify(self, itemvector):
        """Return the class we think the raw (unnormalized) itemvector is in."""
        return self.KNN(self.normalizevector(itemvector))


def tenfold(bucketprefix, dataformat, k):
    """Run 10-fold cross validation and print the confusion matrix.

    For each bucket 1-10 a ``Classifier`` is built that leaves that bucket
    out, the bucket is then classified with ``testbucket`` and the per-class
    tallies of all ten runs are added up. The summed table, the percentage
    classified correctly and the number of instances are printed.

    Args:
        bucketprefix: path prefix of the bucket files.
        dataformat: tab-separated column description passed to ``Classifier``.
        k: number of nearest neighbors passed to ``Classifier``.

    Returns:
        None; the results are only printed.
    """
    results = {}
    for i in range(1, 11):
        c = Classifier(bucketprefix, i, dataformat, k)
        t = c.testbucket(bucketprefix, i)
        for (key, value) in t.items():
            merged = results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                merged[ckey] = merged.get(ckey, 0) + cvalue

    # Use every class seen as a true class OR as a prediction, so that a
    # predicted-only class still gets a column and is counted in the total.
    seen = set(results)
    for predictions in results.values():
        seen.update(predictions)
    categories = sorted(seen, key=str)

    # now print results
    labels = [str(category) for category in categories]
    labelwidth = max([len(label) for label in labels] or [0])
    cellwidth = max(3, labelwidth)
    print("\n Classified As:")
    # Header, separator and rows share one cell width (cellwidth + 3
    # characters) so that the columns line up.
    header = " " * (labelwidth + 2)
    subheader = " " * (labelwidth + 1) + "+"
    for label in labels:
        header += " %*s  " % (cellwidth, label)
        subheader += "-" * (cellwidth + 2) + "+"
    print(header)
    print(subheader)
    total = 0
    correct = 0
    for (category, label) in zip(categories, labels):
        row = "%-*s |" % (labelwidth, label)
        predictions = results.get(category, {})
        for c2 in categories:
            count = predictions.get(c2, 0)
            row += " %*i |" % (cellwidth, count)
            total += count
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    # guard against an empty run instead of dividing by zero
    percent = (correct * 100.0) / total if total else 0.0
    print("\n%5.3f percent correct" % percent)
    print("total of %i instances" % total)

# Cross-validate on both Pima data sets with k = 1. The format string must be
# tab-separated because Classifier splits it on tabs: eight numeric columns
# followed by the class column.
print("SMALL DATA SET")
tenfold("pimasmall/pimasmall",
        "num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass", 1)
print("\n\nlarge DATA SET")

tenfold("pima/pima",
        "num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass", 1)

# # #tenfold ("mpgdata/mpgdata", "classnumnumnumnumnumcomment")

Knn's Python Code

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.