# KNN Algorithm Ideas:
#-----------------------------------------------------#
#step1: Read in the data and store it as a list of records
#step2: Preprocess the data: handle missing values, normalize, etc.
#step3: Set the value of K
#step4: Calculate the distance between the sample to be classified and every training sample (binary, ordinal and continuous attributes)
#step5: Determine the class of the sample by majority vote
#step6: Measure the accuracy on a test set
#-----------------------------------------------------#
Note: I am a Python beginner and do not yet know many of its advanced features, so please forgive code that reads like C. I also hope readers will point out mistakes and areas that could be improved; making progress together is best.
Description: The data set comes from the well-known UCI Machine Learning Repository: http://archive.ics.uci.edu/ml/datasets/adult
# author: cwx
# date: 2015/9/1
# function: a classifier using the kNN algorithm
import heapq
import math

# Column index of every attribute in a record of the UCI "Adult" data set.
# "fnlwg" is this script's spelling of the UCI "fnlwgt" column.
attributes = {
    "age": 0,
    "workclass": 1,
    "fnlwg": 2,
    "education": 3,
    "education-num": 4,
    "marital-status": 5,
    "occupation": 6,
    "relationship": 7,
    "race": 8,
    "sex": 9,
    "capital-gain": 10,
    "capital-loss": 11,
    "hours-per-week": 12,
    "native-country": 13,
    "salary": 14,
}

# Columns that preprocessing() rescales to the range [0, 1].
_CONTINUOUS_ATTRIBUTES = (
    "age",
    "fnlwg",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
)

# Columns that preprocessing() encodes with computeordinal(), together with
# their known values.  The position of a value in its list determines the
# number it is mapped to, so the order of these lists must not be changed
# casually.
_ORDINAL_LEVELS = (
    ("workclass", [
        "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
        "Local-gov", "State-gov", "Without-pay", "Never-worked",
    ]),
    ("education", [
        "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
        "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
        "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool",
    ]),
    ("marital-status", [
        "Married-civ-spouse", "Divorced", "Never-married", "Separated",
        "Widowed", "Married-spouse-absent", "Married-AF-spouse",
    ]),
    ("occupation", [
        "Tech-support", "Craft-repair", "Other-service", "Sales",
        "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
        "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
        "Transport-moving", "Priv-house-serv", "Protective-serv",
        "Armed-Forces",
    ]),
    ("relationship", [
        "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
        "Unmarried",
    ]),
    ("race", [
        "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black",
    ]),
    ("sex", ["Female", "Male"]),
    ("native-country", [
        "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
        "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
        "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
        "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
        "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
        "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland",
        "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru",
        "Hong", "Holand-Netherlands",
    ]),
)


def _normalized(value):
    """Return *value* without surrounding whitespace (non-strings unchanged)."""
    return value.strip() if hasattr(value, "strip") else value


def _has_missing(record):
    """Return True if any field of *record* is the missing-value marker "?"."""
    return any(_normalized(field) == "?" for field in record)


def _is_high_income(label):
    """Return True if a salary label means ">50K".

    Surrounding whitespace and a trailing "." are ignored, so " >50K" and
    " >50K." are treated alike.
    """
    return str(label).strip().rstrip(".") == ">50K"


def _nearest_indices(data, target, candidate_count, k):
    """Return the indices of the *k* rows of ``data[:candidate_count]`` closest to *target*.

    Indices (not distances) are ranked, so rows that are equally far away
    are each counted once instead of the first one being picked repeatedly.
    """
    distances = [discal(data[j], target, "Elucildean")
                 for j in range(candidate_count)]
    return heapq.nsmallest(k, range(candidate_count),
                           key=distances.__getitem__)


def read_txt(filename):
    """Read a comma-separated text file into a list of records.

    Each non-blank line becomes one list of fields, split on ",".  Fields are
    returned exactly as they appear in the file (including any space that
    follows a comma).  Blank lines are skipped.
    """
    records = []
    with open(filename, "r") as fp:
        for line in fp:
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            records.append(line.split(","))
    return records


def computena(items):
    """Remove, in place, every record that contains a missing value ("?").

    items -- the whole list of records; it is modified and also returned.
    """
    items[:] = [record for record in items if not _has_missing(record)]
    return items


def discal(lst1, lst2, type):
    """Calculate the distance between two records.

    The last element of each record is not included in the distance (the
    loop bound is ``len(lst2) - 1``); the callers in this file keep the
    class label there.

    type -- "Manhattan", or "Euclidean" (the historical misspelling
            "Elucildean" is still accepted); case is ignored.

    Returns the distance, or -1 after printing a message when *type* is not
    recognised.
    """
    metric = str(type).lower()
    width = len(lst2) - 1
    pairs = zip(lst1[:width], lst2[:width])
    if metric == "manhattan":
        return sum(abs(a - b) for a, b in pairs)
    if metric in ("euclidean", "elucildean"):
        # Square root of the summed squares.  Taking the root of every
        # squared difference separately would just be the Manhattan distance.
        return math.sqrt(sum((a - b) ** 2 for a, b in pairs))
    print("Error in type name")
    return -1


def computecontinous(datalist, attribute):
    """Min-max normalise a continuous column in place.

    Every ``record[attribute]`` is replaced by
    ``(value - min) / (max - min)``.  When all values are equal the column is
    set to 0.0 instead of dividing by zero.  Returns *datalist*.
    """
    if not datalist:
        return datalist
    values = [float(record[attribute]) for record in datalist]
    low = min(values)
    span = max(values) - low
    for record, value in zip(datalist, values):
        record[attribute] = (value - low) / span if span else 0.0
    return datalist


def computeordinal(datalist, attribute, level):
    """Encode a categorical column in place using the order given by *level*.

    The value at position ``i`` of *level* is mapped to
    ``i / (len(level) - 1)``; with a single level it is mapped to 0.0.
    Values are compared after stripping surrounding whitespace.  A value that
    does not occur in *level* raises KeyError.  Returns *datalist*.
    """
    top = len(level) - 1
    level_dict = {}
    for position, name in enumerate(level):
        level_dict[_normalized(name)] = float(position) / top if top else 0.0
    for record in datalist:
        record[attribute] = level_dict[_normalized(record[attribute])]
    return datalist


def preprocessing(datalist):
    """Clean and normalise the records of *datalist* in place.

    Records with missing values are removed, continuous columns are rescaled
    with computecontinous() and categorical columns are encoded with
    computeordinal().  The salary column is left untouched.  Returns
    *datalist*.
    """
    b = computena(datalist)
    for name in _CONTINUOUS_ATTRIBUTES:
        computecontinous(b, attributes[name])
    for name, levels in _ORDINAL_LEVELS:
        computeordinal(b, attributes[name], levels)
    return b


def knnalgorithm(datatrain, sample, attribute, k):
    """Classify one *sample* by majority vote of its *k* nearest training records.

    datatrain -- raw training records (not modified)
    sample    -- raw record to classify (not modified); its last field is
                 ignored by the distance calculation
    attribute -- index of the salary column in a record
    k         -- number of neighbours that vote

    Prints the verdict and returns ">50K" or "<=50K".  A tie counts as
    "<=50K".  Raises ValueError when *sample* has missing values.
    """
    if _has_missing(sample):
        raise ValueError("sample contains missing values ('?')")
    # Work on copies: preprocessing() rewrites the records it is given.
    merged = computena([list(record) for record in datatrain])
    merged.append(list(sample))
    data = preprocessing(merged)
    train_count = len(data) - 1  # every record except the sample itself
    neighbours = _nearest_indices(data, data[-1], train_count, k)
    yes = sum(1 for idx in neighbours if _is_high_income(data[idx][attribute]))
    no = len(neighbours) - yes
    if yes > no:
        print("sample's salary >50K")
        return ">50K"
    print("sample's salary <=50K")
    return "<=50K"


def assessment(datatrain, datatest, atrribute, k):
    """Measure the accuracy of the kNN classifier on a test set.

    datatrain -- raw training records (not modified)
    datatest  -- raw records to classify (not modified)
    atrribute -- index of the salary column in a record
    k         -- number of neighbours that vote

    Both sets are normalised together.  A prediction is ">50K" only when
    strictly more than half of the neighbours are ">50K".  Prints and returns
    the fraction of test records classified correctly.  Raises ValueError
    when no usable test record remains.
    """
    # Work on copies: preprocessing() rewrites the records it is given.
    train_rows = computena([list(record) for record in datatrain])
    test_rows = computena([list(record) for record in datatest])
    if not test_rows:
        raise ValueError("no test records without missing values")
    train_count = len(train_rows)
    data = preprocessing(train_rows + test_rows)

    correct = 0
    for target in data[train_count:]:
        neighbours = _nearest_indices(data, target, train_count, k)
        yes = sum(1 for idx in neighbours
                  if _is_high_income(data[idx][atrribute]))
        predicted_high = yes > len(neighbours) - yes
        if predicted_high == _is_high_income(target[atrribute]):
            correct += 1

    correct_ratio = float(correct) / len(test_rows)
    print("correct_ratio = %s" % correct_ratio)
    return correct_ratio


# Example record for knnalgorithm(); its salary label is ">50K".
sample = [" 65", " Private", " 184454", " HS-grad", " 9",
          " Married-civ-spouse", " Machine-op-inspct", " Husband", " White",
          " Male", " 6418", " 0", " 40", " United-States", " >50K"]
# Alternative example whose salary label is "<=50K":
# sample = [" 80", " Private", " 226802", " 11th", " 7", " Never-married",
#           " Machine-op-inspct", " Own-child", " Black", " Male", " 0", " 0",
#           " 40", " United-States", " <=50K"]


def main():
    """Evaluate the classifier on the Adult training and test files."""
    # Raw strings keep the backslashes of the Windows paths literal.
    filename = r"H:\BaiduYunDownload\AdultDatasets\Adult_data.txt"
    # filename = r"D:\MyDesktop-HnH\data.txt"
    a = read_txt(filename)
    print(len(a))
    k = 3
    # To classify the single example record instead:
    # knnalgorithm(a, sample, attributes["salary"], k)
    testname = r"H:\BaiduYunDownload\AdultDatasets\Adult_test.txt"
    testdata = read_txt(testname)
    assessment(a, testdata, attributes["salary"], k)


if __name__ == "__main__":
    main()
Result (as reported by the author): accuracy 0.812416998672
Run time: 1 hour 20 minutes
This article is from the "Lu Yao" blog; please keep this source link when reposting: http://cwxfly.blog.51cto.com/6113982/1691241
Implementation of KNN algorithm in Python