The code is mostly an implementation of the pseudo-code in Xiang Liang's book "Recommender System Practice"; it also draws on https://www.douban.com/note/336280497/
You can also normalize the user similarities, which gives better results.
The data set is the MovieLens 100K data set (100,000 ratings).
Link: MovieLens
#Coding:utf-8ImportRandom,math fromoperatorImportItemgetterclassUserbasedcf:def __init__(self,traindatafile=none,testdatafile=none,splitor='\ t'): iftraindatafile!=None:self.train=self.loaddata (traindatafile, Splitor)iftestdatafile!=None:self.test=self.loaddata (testdatafile, splitor) Self.simimatrix={} defsetData (self,train,test): Self.train=Train Self.test=TestdefLoadData (self,datafile,splitor='\ t'): Data={} forLineinchOpen (datafile): user,item,record,_=line.split () data.setdefault (user,{}) data[user][item]=RecordreturnDatadefRecallandprecision (self,peerscount,topn=10): hit=0 Recall=0 Precision=0 forUserinchSelf.train.keys (): Itemofuser=self.test.get (user,{}) Recitems=self.recommend (user,peerscount,topn) forItem,puiinchRecitems.items ():ifIteminchItemofuser:hit+=1Recall+=Len (itemofuser) Precision+=TopN#print ' recall:%s hit:%s allratings:%s '% (hit/(recall*1.0), Hit,precision) return(hit/(recall * 1.0), Hit/(precision * 1.0)) defCoverage (self,peerscount,topn=10): Recommend_items=Set () all_items=Set () forUserinchSelf.train.keys (): forIteminchSelf.train[user].keys (): all_items.add (item) rank=self.recommend (user,peerscount,topn) forItem,puiinchrank.items (): recommend_items.add (item)returnLen (recommend_items)/(len (all_items) *1.0) defPopularity (self,peerscount,topn=10): item_popularity=dict () forUser,itemsinchSelf.train.items (): forIteminchItems.keys ():ifItem not inchitem_popularity:item_popularity[item]=1item_popularity[item]+=1ret=0 N=0 forUserinchSelf.train.keys (): Rank=self.recommend (user,peerscount,topn) forItem,puiinchRank.items (): ret+=math.log (1 +item_popularity[item]) N+=1returnret/(n*1.0) defcalusersimilarity (self): item_users=dict () forU,ratingsinchSelf.train.items (): forIinchratings.keys (): item_users.setdefault (i,set ()) item_users[i].add (u) #calculate co-rated items between usersCoratedcount=dict () Itemcountofuser=dict () forItem,usersinchItem_users.items (): forUinchUsers:itemCountOfUser.setdefault (u,0) 
itemcountofuser[u]+=1 forVinchusers:ifu==V:ContinueCoratedcount.setdefault (u,{}) Coratedcount[u].setdefault (v,0) coratedcount[u][v]+=1/math.log (1 +Len (USERS)) Usersimimatrix=dict () forU,related_usersinchcoratedcount.items (): usersimimatrix.setdefault (u,{}) forV,cuvinchrelated_users.items (): usersimimatrix[u][v]=cuv/math.sqrt (itemcountofuser[u]*itemcountofuser[v]) Self.simimatrix=UsersimimatrixdefRecommend (self,useru,peerscount,topn=10): Recitems=dict () Interacted_items=self.train[useru]" "prepare the user similarity matrix first" " if notself.simiMatrix:self.calUserSimilarity () forUserv,simiuvinchSorted (self.simimatrix[useru].items (), key=itemgetter (1), reverse=True) [0:peerscount]: forItem,ratingv4iinchSelf.train[userv].items ():ifIteminchinteracted_items:Continue ifItem not inchrecitems:recitems[item]=0 recitems[item]+=simiuv*float (ratingv4i)#Transform 4 stars into score 0.8 " "If Len (recitems) ==topn:return recitems" " returnDict (sorted (recitems.items (), key =LambdaX:x[1],reverse =True) [0:topn])defTESTUSERBASEDCF (): CF=USERBASEDCF (traindatafile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.base', Testdatafile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.test') #cf.calusersimilarity () Print("%3s%15s%15s%15s%15s"% ('K',"Precision",'Recall','Coverage','popularity')) forKinch[5,10,20,40,80,160]: recall,precision= Cf.recallandprecision (peerscount =K) Coverage= Cf.coverage (peerscount =K) Popularity= Cf.popularity (peerscount =k)Print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f"% (k,precision * 100,recall * 100,coverage * 100, Popularity))defSplitdata (wholedata,m,k,seed,splitor='\ t'): Test={} Train={} random.seed (seed) forLineinchWholedata:user,item,score,time=line.strip (). 
Split (splitor)ifRandom.randint (0,m) = =K:test.setdefault (user,{}) test[user][item]=scoreElse: Train.setdefault (user,{}) train[user][item]=scorereturntrain,testdeftestUserBasedCF2 (): Wholedata=open (r'E:\ResearchAndPapers\DataSet\ml-1m\ratings.dat') Train,test=splitdata (wholedata, 8, 5, splitor='::') CF=USERBASEDCF () cf.setdata (train, test)#cf=userbasedcf (traindatafile=r ' E:\ResearchAndPapers\DataSet\ml-100k\u5.base ', testdatafile=r ' e:\ Researchandpapers\dataset\ml-100k\u5.test ') #cf.calusersimilarity () Print("%3s%15s%15s%15s%15s"% ('K',"Precision",'Recall','Coverage','popularity')) forKinch[5,10,20,40,80,160]: recall,precision= Cf.recallandprecision (peerscount =K) Coverage= Cf.coverage (peerscount =K) Popularity= Cf.popularity (peerscount =k)Print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f"% (k,precision * 100,recall * 100,coverage * 100, Popularity)) if __name__=="__main__": TESTUSERBASEDCF ()#testUserBasedCF2 ()
Python implementation of collaborative filtering based on user similarity