I. The overall structure of the recommendation system
1.1 Select User Preferences
II. Open-source recommendation system
2.1 Collaborative filtering and its algorithm
(1) Data preprocessing and the UI (user-item) matrix: user behaviour is grouped into "view" and "buy" events, then preprocessed with noise reduction and normalization (a minimal normalization sketch follows this list).
(2) Recommendation model: user CF and item CF, i.e. collaborative filtering over users or over items (the approach used by Amazon, Netflix, Hulu, YouTube, etc.). Here we use the KNN nearest-neighbor algorithm.
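The UI matrices used in the snippets below hold row-normalized behaviour counts, so that each user (or item) row sums to 1. A minimal sketch of that normalization step follows; the raw counts are illustrative (not from the original post), chosen so the normalized rows reproduce the dataMat used in the user CF snippet:

# -*- coding: utf-8 -*-
import numpy as np

# hypothetical raw behaviour counts: rows = users, columns = items
# (a "buy" could be weighted higher than a "view" before this step)
raw_counts = np.array([[5.0, 0.0, 4.0, 4.0, 4.0, 4.0],
                       [0.0, 3.0, 0.0, 5.0, 4.0, 5.0],
                       [5.0, 4.0, 3.0, 3.0, 5.0, 5.0]])

# row-normalize so that every user vector sums to 1 (assumes no all-zero rows)
row_sums = raw_counts.sum(axis=1, keepdims=True)
ui_matrix = raw_counts / row_sums

print(ui_matrix.round(4))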
(1) User CF
# -*- coding: utf-8 -*-
from numpy import *
import numpy as np
from recommand_lib import KNN

dataMat = mat([[0.238, 0, 0.1905, 0.1905, 0.1905, 0.1905],
               [0, 0.177, 0, 0.294, 0.235, 0.294],
               [0.2, 0.16, 0.12, 0.12, 0.2, 0.2]])
testSet = [0.2174, 0.2174, 0.1304, 0, 0.2174, 0.2174]
classLabel = np.array(['B', 'C', 'D'])
print(KNN(testSet, dataMat, classLabel, 3))
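recommand_lib is the author's helper module and its KNN implementation is not listed in the post; a minimal sketch of a compatible nearest-neighbour classifier (the interface KNN(testVec, dataMat, labels, k) is assumed from the calls above) could look like this:

import numpy as np

def KNN(testVec, dataMat, labels, k):
    """Return the majority label among the k rows of dataMat closest to testVec."""
    data = np.asarray(dataMat, dtype=float)
    test = np.asarray(testVec, dtype=float)
    # Euclidean distance from the test vector to every row of the data matrix
    dists = np.sqrt(((data - test) ** 2).sum(axis=1))
    # labels of the k nearest rows
    nearest = np.asarray(labels)[np.argsort(dists)[:k]]
    # majority vote over the k neighbours
    values, counts = np.unique(nearest, return_counts=True)
    return values[np.argmax(counts)]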
(2) Item CF
# -*- coding: utf-8 -*-
from numpy import *
import numpy as np
from recommand_lib import KNN

dataMat = mat([[0.417, 0.0, 0.25, 0.333],
               [0.3, 0.4, 0.0, 0.3],
               [0.0, 0.0, 0.625, 0.375],
               [0.278, 0.222, 0.222, 0.278],
               [0.263, 0.211, 0.263, 0.263]])
testSet = [0.334, 0.333, 0.0, 0.333]
classLabel = np.array(['B', 'C', 'D', 'E', 'F'])
print(KNN(testSet, dataMat, classLabel, 3))
The run results are as follows:
(3) KMeans clustering (scikit-learn library) for computing similarity: to reduce the amount of computation, the best choice is to cluster the data first. A simple implementation of the algorithm is as follows:
# -*- coding: utf-8 -*-
# Filename: 02kmeans1.py
import time
import numpy as np
from numpy import *
from recommand_lib import *
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

k = 4
dataSet = file2matrix("testdata/4k2_far.txt", "\t")
dataMat = mat(dataSet[:, 1:])          # convert to matrix form
kmeans = KMeans(init='k-means++', n_clusters=4)
kmeans.fit(dataMat)
# the generated clustDist: cluster-center index (column 1) and distance to that
# center (column 2), with rows corresponding one-to-one with the dataset
drawScatter(plt, dataMat, size=20, color='b', mrkr='.')
# draw the cluster centers
drawScatter(plt, kmeans.cluster_centers_, size=60, color='red', mrkr='D')
plt.show()
The results of the operation are as follows:
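drawScatter and file2matrix come from the author's recommand_lib, which is not listed here. An equivalent sketch that relies only on scikit-learn and matplotlib (the file path, the tab-separated layout with an id column first, and plotting the first two feature columns are assumptions carried over from the listing above) could be:

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# assumed layout: first column is an id/label, remaining columns are features
data = np.loadtxt("testdata/4k2_far.txt", delimiter="\t")[:, 1:]

kmeans = KMeans(init='k-means++', n_clusters=4, n_init=10).fit(data)

# labels_ gives the cluster index per row; transform() gives distances to every center
cluster_index = kmeans.labels_
dist_to_own_center = kmeans.transform(data)[np.arange(len(data)), cluster_index]

plt.scatter(data[:, 0], data[:, 1], s=20, c=cluster_index, marker='.')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=60, c='red', marker='D')
plt.show()

Together, cluster_index and dist_to_own_center give the same information as the clustDist table described in the comment above.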
An improved clustering algorithm: the bisecting KMeans algorithm:
# -*- coding: utf-8 -*-
# Filename: 02kmeans1.py
from numpy import *
import numpy as np
from recommand_lib import *
import matplotlib.pyplot as plt

# build the data set from file
dataMat = file2matrix("testdata/4k2_far.txt", "\t")
dataSet = mat(dataMat[:, 1:])        # convert to matrix form
k = 4                                # number of clusters
m = shape(dataSet)[0]

# initialize the first cluster center: the mean of each column
centroid0 = mean(dataSet, axis=0).tolist()[0]
centList = [centroid0]               # put the mean centroid into the center list

# initialize the cluster-distance table: cluster index (column 1), squared distance (column 2)
clustDist = mat(zeros((m, 2)))
for j in range(m):
    clustDist[j, 1] = distEclud(centroid0, dataSet[j, :]) ** 2

# generate the k cluster centers one by one
while len(centList) < k:
    # initialize the minimum sum of squared errors (SSE), the core criterion:
    # the smaller this value, the better the clustering
    lowestSSE = inf
    # ---- 1. traverse each vector of centList and use clustDist to compute lowestSSE,
    #         determining bestCentToSplit, bestNewCents, bestClustAss ----
    for i in range(len(centList)):
        # points currently assigned to cluster i
        ptsInCurrCluster = dataSet[nonzero(clustDist[:, 0].A == i)[0], :]
        # split ptsInCurrCluster into two clusters with the standard kMeans (k=2)
        # and get the corresponding cluster-distance table
        centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2)
        # sum of squared distances of the split part
        sseSplit = sum(splitClustAss[:, 1])
        # sum of squared distances of the rows of clustDist whose cluster index != i
        sseNotSplit = sum(clustDist[nonzero(clustDist[:, 0].A != i)[0], 1])
        if (sseSplit + sseNotSplit) < lowestSSE:
            bestCentToSplit = i                   # the cluster center chosen for splitting
            bestNewCents = centroidMat            # the two new cluster centers
            bestClustAss = splitClustAss.copy()   # deep copy as the best cluster-distance table
            lowestSSE = sseSplit + sseNotSplit    # update lowestSSE
    # back in the outer loop
    # ---- 2. compute the new clustDist ----
    # bestClustAss is split into two parts:
    # part 1: points assigned to the new (second) center get index len(centList)
    bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
    # part 2: points that stay are indexed with the split cluster center
    bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
    # the above computes bestClustAss

    # ---- 3. rebuild the cluster-center list with the best split ----
    # overwrite: bestNewCents[0,:].tolist()[0] replaces the old center at bestCentToSplit
    centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
    # append: add the new bestNewCents[1,:].tolist()[0] vector as an extra center
    centList.append(bestNewCents[1, :].tolist()[0])
    # update the rows of clustDist belonging to the split center with the best cluster distances
    clustDist[nonzero(clustDist[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    # the above computes centList

color_cluster(clustDist[:, 0:1], dataSet, plt)
print("centList:", mat(centList))
# print("clustDist:", clustDist)
# draw the cluster centers
drawScatter(plt, mat(centList), size=60, color='red', mrkr='D')
plt.show()
Algorithm Run Result:
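As an aside, recent scikit-learn releases (1.1 and later) also ship a BisectingKMeans estimator, so the same idea can be tried without hand-rolling the split loop. A minimal sketch, assuming that scikit-learn version and the same file layout as above:

import numpy as np
from sklearn.cluster import BisectingKMeans

# assumed layout: first column is an id/label, remaining columns are features
data = np.loadtxt("testdata/4k2_far.txt", delimiter="\t")[:, 1:]

bkm = BisectingKMeans(n_clusters=4, random_state=0).fit(data)
print(bkm.cluster_centers_)     # the four cluster centers
print(bkm.labels_[:10])         # cluster index of the first ten points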
(4) Computing similarity with SVD (the latent semantic model: singular value decomposition)
# -*- coding: utf-8 -*-
# Filename: svdRec2.py
'''
Created on Mar 8
@author: Peter
'''
from numpy import *
from numpy import linalg as la

def loadExData():
    return [[0, 0, 0, 2, 2],
            [0, 0, 0, 3, 3],
            [0, 0, 0, 1, 1],
            [1, 1, 1, 0, 0],
            [2, 2, 2, 0, 0],
            [5, 5, 5, 0, 0],
            [1, 1, 1, 0, 0]]

def loadRecData():
    return [[4, 4, 0, 2, 2],
            [4, 0, 0, 3, 3],
            [4, 0, 0, 1, 1],
            [1, 1, 1, 0, 0],
            [2, 2, 2, 0, 0],
            [5, 5, 5, 0, 0],
            [1, 1, 1, 0, 0]]

def loadExData2():
    return [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
            [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
            [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
            [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
            [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
            [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
            [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
            [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
            [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
            [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
            [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]

# Euclidean distance similarity
# Euclidean distance in 2-D space: sqrt((x1-x2)^2 + (y1-y2)^2)
def ecludSim(inA, inB):
    return 1.0 / (1.0 + la.norm(inA - inB))

# Pearson similarity: the corrcoef correlation coefficient measures the degree of
# linear correlation between X and Y; the larger its absolute value, the stronger
# the correlation. E((X-EX)(Y-EY)) / sqrt(D(X)D(Y))
def pearsSim(inA, inB):
    if len(inA) < 3:
        return 1.0
    return 0.5 + 0.5 * corrcoef(inA, inB, rowvar=0)[0][1]

# cosine similarity: the cosine of the angle between two vectors in space
# for two n-dimensional points A(x11, x12, ..., x1n) and B(x21, x22, ..., x2n):
# cos(theta) = A.B / (|A|*|B|)
def cosSim(inA, inB):
    num = float(inA.T * inB)
    denom = la.norm(inA) * la.norm(inB)
    return 0.5 + 0.5 * (num / denom)

# standard estimate of a user's rating for an item, used for similarity-based recommendation
def standEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]              # number of columns (items)
    simTotal = 0.0
    ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user, j]  # element value in the user's row
        if userRating == 0:
            continue                   # skip items the user has not rated
        # logical_and: element-wise logical AND, returns True/False per element
        # dataMat[:, item].A > 0: rated entries of column `item`
        # dataMat[:, j].A > 0:    rated entries of column j
        # overLap: row indices where both columns have ratings greater than 0
        overLap = nonzero(logical_and(dataMat[:, item].A > 0,
                                      dataMat[:, j].A > 0))[0]
        # compute the similarity over the overlapping rows
        if len(overLap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        # print('the similarity between column %d and column %d is: %f' % (item, j, similarity))
        simTotal += similarity                   # accumulate the total similarity
        ratSimTotal += similarity * userRating   # similarity * element value
    if simTotal == 0:
        return 0                       # if the total similarity is 0, return 0
    else:
        # print('ratSimTotal:', ratSimTotal)
        # print('simTotal:', simTotal)
        return ratSimTotal / simTotal  # similarity * element value / total similarity

# estimate using SVD
def svdEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    # the core of the SVD-based similarity: singular value decomposition of the matrix
    U, Sigma, VT = la.svd(dataMat)
    # Sig4 = mat(eye(4) * Sigma[:4])                 # diagonal matrix of the first 4 singular values
    # xformedItems = dataMat.T * U[:, :4] * Sig4.I   # create the transformed items
    V = VT.T                           # V spans the item space of dataMat
    xformedItems = V[:, :4]
    # print('xformedItems:', xformedItems)
    # iterate through the items
    for j in range(n):
        userRating = dataMat[user, j]
        # print('userRating:', userRating)
        # unrated items are 0 and are not counted; skip them (and the item itself)
        if userRating == 0 or j == item:
            continue
        # similarity between the two item vectors using the chosen measure
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        # print('the similarity between column %d and column %d is: %f' % (item, j, similarity))
        simTotal += similarity                   # accumulate the total similarity
        ratSimTotal += similarity * userRating   # similarity * item rating
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal / simTotal

# the main method that produces the recommendations
# simMeas: cosSim, pearsSim or ecludSim
# estMethod: standEst or svdEst
# user: row index of the user in the user-item matrix
# N=3 returns the top 3
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=svdEst):
    # find the unrated items: the 0 entries in the user's row of the item matrix
    unratedItems = nonzero(dataMat[user, :].A == 0)[1]
    # print('unratedItems:', unratedItems)
    if len(unratedItems) == 0:
        return "all items have been rated"
    # item score list: each element is (item, estimated score)
    itemScores = []
    # for every unrated item, estimate its score by comparing with the rated items
    for item in unratedItems:
        estimatedScore = estMethod(dataMat, user, simMeas, item)
        itemScores.append((item, estimatedScore))
    # return the items sorted by score; N=3 returns the top 3
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N]

# print a matrix as 0/1 using a threshold
def printMat(inMat, thresh=0.8):
    for i in range(32):
        for k in range(32):
            if float(inMat[i, k]) > thresh:
                print(1, end='')
            else:
                print(0, end='')
        print('')

# image compression with SVD
def imgCompress(numSV=3, thresh=0.8, flag=True):
    myl = []
    for line in open('0_5.txt').readlines():
        newRow = []
        for i in range(32):
            newRow.append(int(line[i]))
        myl.append(newRow)
    myMat = mat(myl)
    print("****original matrix******")
    printMat(myMat, thresh)
    U, Sigma, VT = la.svd(myMat)
    print("U rows and columns:", shape(U)[0], ",", shape(U)[1])
    print("Sigma:", Sigma)
    print("VT rows and columns:", shape(VT)[0], ",", shape(VT)[1])
    if flag:
        SigRecon = mat(zeros((numSV, numSV)))
        for k in range(numSV):          # construct the diagonal matrix from the vector
            SigRecon[k, k] = Sigma[k]
        reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
        print("****reconstructed matrix using %d singular values******" % numSV)
        printMat(reconMat, thresh)
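Assuming the module above is saved as svdRec2.py, a quick interactive check of the recommender (user in row 2, cosine similarity, SVD-based estimate) might look like this; the output itself is not reproduced here:

from numpy import mat
from svdRec2 import loadExData2, recommend, svdEst, cosSim

dataMat = mat(loadExData2())
# top-3 recommendations (item index, estimated score) for the user in row 2,
# using cosine similarity in the SVD-reduced item space
print(recommend(dataMat, 2, N=3, simMeas=cosSim, estMethod=svdEst))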
# -*- coding: utf-8 -*-
# Filename: testRecomm01.py
from numpy import *
import numpy as np
import operator
from svdRec import *
import matplotlib.pyplot as plt

eps = 1.0e-6
# load the rating data
A = mat([[5, 5, 3, 0, 5, 5],
         [5, 0, 4, 0, 4, 4],
         [0, 3, 0, 5, 4, 5],
         [5, 4, 3, 3, 5, 5]])

# compute the SVD by hand
U = A * A.T
lamda, hU = np.linalg.eig(U)     # hU: eigenvectors of U
VT = A.T * A
eV, hVT = np.linalg.eig(VT)      # hVT: eigenvectors of VT
hV = hVT.T
# print("hU:", hU)
# print("hV:", hV)
sigma = sqrt(lamda)              # the singular values are the square roots of the eigenvalues
print("sigma:", sigma)

print("SVD validation results:")
Sigma = np.zeros([shape(A)[0], shape(A)[1]])
U, S, VT = np.linalg.svd(A)
# Sigma[:shape(A)[0], :shape(A)[0]] = np.diag(S)
# print(U)
print(S)
# print(VT)
# print(U * Sigma * VT)
The results of the operation are as follows:
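As a supplementary check: np.linalg.eig can return the eigenvalues in a different order (and the eigenvectors up to a sign) than np.linalg.svd, so a more direct validation is to rebuild A from numpy's SVD factors. A small sketch, assuming the script above has just run so that A, U, S, VT and eps are in scope:

# rebuild A from numpy's SVD factors: A = U * diag(S) * (top len(S) rows of VT)
reconstructed = U @ np.diag(S) @ VT[:len(S), :]
print(np.allclose(reconstructed, A, atol=eps))   # expected to print True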
This chapter mainly introduced the KMeans unsupervised clustering algorithm and its Python implementation, then explained the most central algorithm, SVD, and finally used NumPy library functions to implement the SVD algorithm.