import heapq
import random
class Classifier:
    """k-nearest-neighbor classifier trained from tab-separated bucket files.

    Training data comes from files named "<bucketPrefix>-01" .. "<bucketPrefix>-10",
    skipping the test bucket; each attribute column is normalized with the
    modified standard score (median / absolute standard deviation).
    """

    def __init__(self, bucketPrefix, testBucketNumber, dataFormat, k):
        """Build a classifier from the bucket files with prefix bucketPrefix,
        excluding the file numbered testBucketNumber.

        dataFormat is a tab-separated string describing how to interpret each
        column of a data line. For example, for the mpg data the format is:
        "class\tnum\tnum\tnum\tnum\tnum\tcomment"
        """
        self.medianAndDeviation = []
        self.k = k
        # how to interpret each column of a data line
        self.format = dataFormat.strip().split('\t')
        self.data = []
        # read every bucket except the one reserved for testing
        for i in range(1, 11):
            if i != testBucketNumber:
                filename = "%s-%02i" % (bucketPrefix, i)
                f = open(filename)
                lines = f.readlines()
                f.close()
                # NOTE: the first line of each bucket file is skipped
                # (presumably a header) — confirm against the data files.
                for line in lines[1:]:
                    fields = line.strip().split('\t')
                    ignore = []
                    vector = []
                    # j (not i) so the outer bucket index is not shadowed
                    for j in range(len(fields)):
                        if self.format[j] == 'num':
                            vector.append(float(fields[j]))
                        elif self.format[j] == 'comment':
                            ignore.append(fields[j])
                        elif self.format[j] == 'class':
                            classification = fields[j]
                    self.data.append((classification, vector, ignore))
        # keep an un-normalized copy of the instances
        self.rawData = list(self.data)
        # length of an instance vector
        self.vlen = len(self.data[0][1])
        # normalize every attribute column in place
        for i in range(self.vlen):
            self.normalizeColumn(i)

    ##################################################
    ###
    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE

    def getMedian(self, alist):
        """Return the median of alist (an empty list for empty input)."""
        if alist == []:
            return []
        blist = sorted(alist)
        length = len(alist)
        if length % 2 == 1:
            # odd length: return the middle element
            return blist[int(((length + 1) / 2) - 1)]
        else:
            # even length: average the two middle elements
            v1 = blist[int(length / 2)]
            v2 = blist[int(length / 2) - 1]
            return (v1 + v2) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Given alist and its median, return the absolute standard deviation."""
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)

    def normalizeColumn(self, columnNumber):
        """Normalize column columnNumber of self.data using the modified
        standard score: (value - median) / absolute standard deviation."""
        # extract the column values to a list
        col = [v[1][columnNumber] for v in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        # remember the parameters so unseen vectors can be normalized too
        self.medianAndDeviation.append((median, asd))
        for v in self.data:
            v[1][columnNumber] = (v[1][columnNumber] - median) / asd

    def normalizeVector(self, v):
        """Normalize vector v using the stored per-column median and asd."""
        vector = list(v)
        for i in range(len(vector)):
            (median, asd) = self.medianAndDeviation[i]
            vector[i] = (vector[i] - median) / asd
        return vector

    ###
    ###  END NORMALIZATION
    ##################################################

    def testBucket(self, bucketPrefix, bucketNumber):
        """Evaluate the classifier with data from the file
        bucketPrefix-bucketNumber; return a confusion-matrix dict
        {realClass: {classifiedAs: count}}."""
        filename = "%s-%02i" % (bucketPrefix, bucketNumber)
        f = open(filename)
        lines = f.readlines()
        totals = {}
        f.close()
        for line in lines:
            data = line.strip().split('\t')
            vector = []
            classInColumn = -1
            for i in range(len(self.format)):
                if self.format[i] == 'num':
                    vector.append(float(data[i]))
                elif self.format[i] == 'class':
                    classInColumn = i
            theRealClass = data[classInColumn]
            classifiedAs = self.classify(vector)
            totals.setdefault(theRealClass, {})
            totals[theRealClass].setdefault(classifiedAs, 0)
            totals[theRealClass][classifiedAs] += 1
        return totals

    def manhattan(self, vector1, vector2):
        """Compute the Manhattan distance between two vectors."""
        return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))

    def knn(self, itemVector):
        """Return the predicted class of itemVector using k nearest neighbors."""
        # heapq.nsmallest (rather than min) yields the k closest neighbors
        neighbors = heapq.nsmallest(
            self.k,
            [(self.manhattan(itemVector, item[1]), item) for item in self.data])
        # each neighbor gets one vote for its class
        results = {}
        for neighbor in neighbors:
            theClass = neighbor[1][0]
            results.setdefault(theClass, 0)
            results[theClass] += 1
        resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True)
        # collect every class that received the maximum number of votes
        maxVotes = resultList[0][0]
        possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes]
        # break ties randomly among the top-voted classes
        answer = random.choice(possibleAnswers)
        return answer

    def classify(self, itemVector):
        """Return the class we think itemVector is in."""
        # k (how many neighbors to use) was fixed at construction time
        return self.knn(self.normalizeVector(itemVector))
def tenfold(bucketPrefix, dataFormat, k):
    """Run ten-fold cross-validation over the bucket files with prefix
    bucketPrefix, training a k-NN Classifier on nine buckets and testing on
    the remaining one, then print the aggregated confusion matrix and the
    percent of instances classified correctly."""
    results = {}
    for i in range(1, 11):
        c = Classifier(bucketPrefix, i, dataFormat, k)
        t = c.testBucket(bucketPrefix, i)
        # merge this fold's confusion counts into the running totals
        for (key, value) in t.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
    # now print the results as a confusion matrix
    categories = list(results.keys())
    categories.sort()
    print("\n       Classified as: ")
    header = "        "
    subheader = "      +"
    for category in categories:
        header += "% 2s   " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s    |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
            else:
                count = 0
            row += " %3i |" % count
            total += count
            # diagonal entries are the correctly classified instances
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)
# Run ten-fold cross-validation with k = 1 on both Pima datasets.
print("SMALL DATA SET")
tenfold("pimaSmall/pimaSmall",
        "num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass", 1)

print("\n\nLARGE DATA SET")
tenfold("pima/pima",
        "num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass", 1)

# tenfold("mpgData/mpgData", "class\tnum\tnum\tnum\tnum\tnum\tcomment", 1)
# kNN Python code