# Part 1: write our own kNN classifier.
# Load the example data set (columns: X, Y, Label).
df <- read.csv('G:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\10-recommendations\\data\\example_data.csv')
head(df)
# Compute the pairwise Euclidean distance matrix.
#
# Args:
#   df: data frame with numeric columns 'X' and 'Y'
#       (column names per the glm formula used later -- confirm against data).
# Returns:
#   An nrow(df) x nrow(df) symmetric matrix of Euclidean distances.
Distance.matrix <- function(df)
{
  # Preallocate an n*n matrix filled with NA (for 100 rows: 10,000 cells).
  distance <- matrix(rep(NA, nrow(df) ^ 2), nrow = nrow(df))
  # Fill in the Euclidean distance between every pair of points.
  for (i in seq_len(nrow(df)))
  {
    for (j in seq_len(nrow(df)))
    {
      distance[i, j] <- sqrt((df[i, 'X'] - df[j, 'X']) ^ 2 +
                             (df[i, 'Y'] - df[j, 'Y']) ^ 2)
    }
  }
  return(distance)
}
# Return the indices of the k points nearest to data point i.
#
# distance[i, ] holds the distances from point i to every point. After
# ordering, position 1 is point i itself (distance 0), so take positions
# 2 through k + 1.
K.nearest.neighbors <- function(i, distance, k = 5)
{
  return(order(distance[i, ])[2:(k + 1)])
}
# kNN classifier: predict a 0/1 label for every row of df.
#
# Args:
#   df: data frame with columns 'X', 'Y' and a 0/1 column 'Label'.
#   k:  number of neighbors to vote (default 5).
# Returns:
#   Numeric vector of 0/1 predictions, one per row of df.
KNN <- function(df, k = 5)
{
  # Pairwise distance matrix over all rows.
  distance <- Distance.matrix(df)
  # Preallocate the prediction vector with NA.
  predictions <- rep(NA, nrow(df))
  for (i in seq_len(nrow(df)))
  {
    # Indices of the k points closest to point i (excluding i itself).
    indices <- K.nearest.neighbors(i, distance, k = k)
    # Majority vote: mean neighbor label > 0.5 -> class 1, otherwise 0.
    predictions[i] <- ifelse(mean(df[indices, 'Label']) > 0.5, 1, 0)
  }
  return(predictions)
}
# Append the kNN predictions as a new column.
df <- transform(df, kNNPredictions = KNN(df))
# Count the misclassified points: 7 wrong out of 100 rows, i.e. 93% accuracy.
sum(with(df, Label != kNNPredictions))
# Remove our hand-written KNN so it cannot shadow class::knn used below.
rm('KNN')
# Part 2: kNN using R's 'class' package.
library('class')
df <- read.csv('G:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\10-recommendations\\data\\example_data.csv')
n <- nrow(df)
set.seed(1)
# Randomly draw half of the rows (sorted indices) as the training set;
# the remaining rows form the test set.
indices <- sort(sample(1:n, n * (1 / 2)))
training.x <- df[indices, 1:2]
test.x <- df[-indices, 1:2]
training.y <- df[indices, 3]
test.y <- df[-indices, 3]
# There's a bug here!
predicted.y <- knn(training.x, test.x, training.y, k = 5)
# 7 of the 50 test points are misclassified, i.e. 86% accuracy.
sum(predicted.y != test.y)
# Now compare against logistic regression.
# family = binomial(link = 'logit') makes this a proper logistic model;
# predict() then returns log-odds, so the decision threshold on the link
# scale is 0 (equivalent to probability 0.5).
logit.model <- glm(Label ~ X + Y,
                   data = df[indices, ],
                   family = binomial(link = 'logit'))
predictions <- as.numeric(predict(logit.model, newdata = df[-indices, ]) > 0)
sum(predictions != test.y)
# 16 of the 50 test rows are wrong (68% accuracy): when the decision
# boundary is not linear at all, kNN clearly beats the linear GLM.
# Part 3: recommendation case study on Kaggle data -- predict whether a
# programmer will install a package, given the packages already installed.
installations <- read.csv('G:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\10-recommendations\\data\\installations.csv')
head(installations)
library('reshape')
# The data set has three columns: Package, User, Installed.
# cast() pivots it: one row per User, one column per Package,
# cell value = Installed.
user.package.matrix <- cast(installations, User ~ Package, value = 'Installed')
# The first column of the cast result is the user id; move it into the
# row names and drop it from the data.
row.names(user.package.matrix) <- user.package.matrix[, 1]
user.package.matrix <- user.package.matrix[, -1]
# Package-to-package similarity: correlation between the package columns.
similarities <- cor(user.package.matrix)
# Convert similarity to distance: similarity 1 maps to distance 0,
# similarity -1 maps to distance +Inf.
distances <- -log((similarities / 2) + 0.5)
# Return the indices of the k packages nearest to package i.
# Position 1 after ordering is package i itself (distance 0), so it is
# skipped; positions 2 through k + 1 are the neighbors.
K.nearest.neighbors <- function(i, distances, k = 25)
{
  return(order(distances[i, ])[2:(k + 1)])
}
# Estimated probability that `user` installs `package`: the mean of the
# installed indicators over the package's k nearest neighbor packages.
Installation.probability <- function(user, package, user.package.matrix,
                                     distances, k = 25)
{
  # Packages most similar (closest in distance) to the target package.
  neighbors <- K.nearest.neighbors(package, distances, k = k)
  return(mean(sapply(neighbors,
                     function(neighbor)
                     {
                       user.package.matrix[user, neighbor]
                     })))
}
# Probability that user 1 installs package 1.
Installation.probability(1, 1, user.package.matrix, distances)
# Rank all packages for `user` by installation probability, most probable
# first. Returns the package column indices in descending-probability order.
Most.probable.packages <- function(user, user.package.matrix, distances, k = 25)
{
  # Score every package, then sort the indices by score, highest first.
  return(order(sapply(seq_len(ncol(user.package.matrix)),
                      function(package)
                      {
                        Installation.probability(user,
                                                 package,
                                                 user.package.matrix,
                                                 distances,
                                                 k = k)
                      }),
               decreasing = TRUE))
}
user <- 1
listing <- Most.probable.packages(user, user.package.matrix, distances)
# Names of the ten packages this user is most likely to install.
colnames(user.package.matrix)[listing[1:10]]
# Machine Learning for Hackers reading notes (10) kNN: recommendation system