Machine Learning for Hackers reading notes (10) kNN: Recommendation System

Source: Internet
Author: User

# Part 1: write your own kNN.

# Load the example data set. The code in these notes reads the columns
# X, Y and Label from it.
# NOTE(review): the file name was restored to 'example_data.csv' (the scraped
# text had stray spaces inside the path) -- confirm against the local copy.
df <- read.csv('G:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\10-recommendations\\data\\example_data.csv')
head(df)

# Compute the matrix of pairwise Euclidean distances between the rows of `df`.
#
# Args:
#   df: data frame (or matrix with column names) holding numeric columns
#       'X' and 'Y'; each row is one point.
#
# Returns:
#   An nrow(df) x nrow(df) numeric matrix whose [i, j] entry is the Euclidean
#   distance between point i and point j. A zero-row input yields a 0 x 0
#   matrix.
distance.matrix <- function(df) {
  x <- df[, 'X']
  y <- df[, 'Y']

  # outer(v, v, '-') holds every pairwise difference v[i] - v[j], so the whole
  # matrix is produced without an explicit double loop.
  sqrt(outer(x, x, '-')^2 + outer(y, y, '-')^2)
}

# Find the k points closest to data point i.
#
# Args:
#   i:        row index of the query point in `distance`.
#   distance: square matrix of pairwise distances.
#   k:        number of neighbours to return (default 5).
#
# Returns:
#   Integer vector with the indices of the (at most) k nearest points,
#   closest first. Fewer than k indices are returned when the matrix has
#   fewer than k + 1 points; k = 0 returns an empty vector.
k.nearest.neighbors <- function(i, distance, k = 5) {
  # distance[i, ] is the distance from point i to every point. After ordering,
  # the first position is dropped because it is expected to be point i itself
  # (distance 0 to itself).
  head(order(distance[i, ])[-1], k)
}

# Predict a 0/1 label for every row of `df` from its k nearest neighbours.
#
# Calls distance.matrix() to get the pairwise distances and
# k.nearest.neighbors() to pick the neighbours of each row.
#
# Args:
#   df: data frame with the coordinate columns used by distance.matrix() and
#       a numeric 0/1 column 'Label'.
#   k:  number of neighbours that vote (default 5).
#
# Returns:
#   Numeric vector of length nrow(df): 1 where the mean Label of the k
#   nearest neighbours is greater than 0.5, otherwise 0.
knn <- function(df, k = 5) {
  # Pairwise distance matrix, computed once for all rows.
  distance <- distance.matrix(df)

  vapply(
    seq_len(nrow(df)),
    function(i) {
      # Indices of the k points nearest to point i.
      indices <- k.nearest.neighbors(i, distance, k = k)

      # Majority vote: a mean above 0.5 means most neighbours are labelled 1.
      as.numeric(mean(df[indices, 'Label']) > 0.5)
    },
    numeric(1)
  )
}

# Add the predictions as a new column.
df <- transform(df, knnpredictions = knn(df))

# Count the mispredicted rows. The original notes report 7 errors out of 100
# rows, i.e. 93% accuracy.
sum(with(df, Label != knnpredictions))

# Remove the hand-written knn() so that later calls to knn() are not
# answered by it.
rm('knn')

# Part 2: kNN using the function that ships with R (package 'class').

library('class')
# NOTE(review): the file name was restored to 'example_data.csv' (the scraped
# text had stray spaces inside the path) -- confirm against the local copy.
df <- read.csv('G:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\10-recommendations\\data\\example_data.csv')
n <- nrow(df)
set.seed(1)

# Randomly draw half of the rows 1..n as the training set; the rest form the
# test set. n %/% 2 keeps the sample size an integer when n is odd.
indices <- sort(sample.int(n, n %/% 2))
training.x <- df[indices, 1:2]
test.x <- df[-indices, 1:2]
training.y <- df[indices, 3]
test.y <- df[-indices, 3]
# The original notes flag "there's a bug here!" at this point without saying
# what it is -- NOTE(review): confirm before relying on the result.
predicted.y <- knn(training.x, test.x, training.y, k = 5)

# The original notes report 7 mispredicted points here and state an accuracy
# of 86% on the 50-observation test set.
sum(predicted.y != test.y)

# Now compare with logistic regression. The binomial family is required for
# glm() to fit a logistic model; without it glm() fits an ordinary Gaussian
# linear model.
logit.model <- glm(Label ~ X + Y, data = df[indices, ], family = binomial(link = 'logit'))
# predict() returns the linear predictor (log-odds) by default, so > 0 is the
# same as a predicted probability above 0.5.
predictions <- as.numeric(predict(logit.model, newdata = df[-indices, ]) > 0)
sum(predictions != test.y)

# The original notes report 16 mispredicted points out of 50 rows (68%
# accuracy) and conclude that when the problem is not linear at all,
# k-nearest neighbours behaves better than the GLM.
# NOTE(review): that figure comes from the notes' own run; re-run to confirm.

# Part 3: a recommendation case study using Kaggle data -- given the packages
# a programmer has already installed, predict whether that programmer will
# install another package.

# NOTE(review): the file name was restored to 'installations.csv' (the scraped
# text had stray spaces inside the path) -- confirm against the local copy.
installations <- read.csv('g:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\10-recommendations\\data\\installations.csv')
head(installations)
library('reshape')

# The data set has three columns: package, user and installed flag.
# NOTE(review): the column names Package / User / Installed are restored from
# the case-mangled text -- verify against the CSV header.

# cast() reshapes the data so that each user is a row, each package is a
# column, and the cell value is the installed flag.
# In the resulting matrix the first column is the user name.
user.package.matrix <- cast(installations, User ~ Package, value = 'Installed')

# Move the user names into the row names and drop that column, leaving only
# the per-package columns.
row.names(user.package.matrix) <- user.package.matrix[, 1]

user.package.matrix <- user.package.matrix[, -1]

# Correlation between the package columns.
similarities <- cor(user.package.matrix)

# Convert similarity to distance: similarity 1 becomes distance 0 and
# similarity -1 becomes distance Inf.
distances <- -log((similarities / 2) + 0.5)

# Return the indices of the k points nearest to data point i.
#
# Args:
#   i:         row index of the query point in `distances`.
#   distances: square matrix of pairwise distances.
#   k:         number of neighbours to return (default 25).
#
# Returns:
#   Integer vector with the indices of the (at most) k nearest points,
#   closest first. Fewer than k indices are returned when the matrix has
#   fewer than k + 1 points; k = 0 returns an empty vector.
k.nearest.neighbors <- function(i, distances, k = 25) {
  # The first position after ordering is dropped because it is expected to be
  # point i itself (distance 0 to itself).
  head(order(distances[i, ])[-1], k)
}

# Estimate how likely `user` is to install `package`.
#
# Args:
#   user:                row index of the user in `user.package.matrix`.
#   package:             index of the package (row of `distances`, column of
#                        `user.package.matrix`).
#   user.package.matrix: user-by-package table of installed flags.
#   distances:           package-by-package distance matrix.
#   k:                   number of neighbouring packages to average over
#                        (default 25).
#
# Returns:
#   The mean of the user's installed flags over the k packages nearest to
#   `package`, as found by k.nearest.neighbors().
installation.probability <- function(user, package, user.package.matrix, distances, k = 25) {
  neighbors <- k.nearest.neighbors(package, distances, k = k)

  # Look up the user's flag for each neighbouring package, one cell at a time.
  installed <- vapply(
    neighbors,
    function(neighbor) user.package.matrix[user, neighbor],
    numeric(1)
  )

  mean(installed)
}

# Probability that user 1 installs package 1.

installation.probability(1, 1, user.package.matrix, distances)

# Rank all packages by how likely `user` is to install them.
#
# Args:
#   user:                row index of the user in `user.package.matrix`.
#   user.package.matrix: user-by-package table of installed flags.
#   distances:           package-by-package distance matrix.
#   k:                   neighbourhood size passed on to
#                        installation.probability() (default 25).
#
# Returns:
#   Integer vector of package (column) indices, ordered from the highest to
#   the lowest installation probability.
most.probable.packages <- function(user, user.package.matrix, distances, k = 25) {
  # One probability per package column.
  probabilities <- vapply(
    seq_len(ncol(user.package.matrix)),
    function(package) {
      installation.probability(user,
                               package,
                               user.package.matrix,
                               distances,
                               k = k)
    },
    numeric(1)
  )

  order(probabilities, decreasing = TRUE)
}

user <- 1

# Package indices for this user, most probable first.
listing <- most.probable.packages(user, user.package.matrix, distances)

# Names of the ten most probable packages; head() avoids NA entries when
# there are fewer than ten packages.
colnames(user.package.matrix)[head(listing, 10)]

Machine Learning for Hackers reading notes (10) kNN: Recommendation System

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.