The original data is as follows:
u1 a,d,b,c
u2 a,a,c
u3 b,d
u4 a,d,c
u5 a,b,c
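The similarity is computed with the following formula, where $U(i)$ is the set of users who have item $i$: $\mathrm{sim}(i, j) = \dfrac{|U(i) \cap U(j)|}{|U(i) \cup U(j)|}$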
where $|U(i) \cup U(j)| = |U(i)| + |U(j)| - |U(i) \cap U(j)|$
The original Hadoop implementation requires five rounds of MapReduce (MR); after optimization, the job can be completed in only two rounds.
The earlier implementation needed so many rounds mainly because computing $|U(i) \cup U(j)|$ required changing the key several times, not because the amount of computation is large. By simply modifying the key that is passed along, the whole job can be done in two rounds.
mapper_1.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Round-1 mapper: emit per-item counts and per-item-pair co-occurrences.

Input (stdin), one record per line::

    <user> <item>,<item>,...

Output (stdout), one record per line::

    <i1> 1 norm        once for every distinct item of the user
    <i1> <i2> 1 dot    once for every unordered pair of distinct items,
                       always written with i1 < i2 (string order)

Only data records are written to stdout; the per-line debug dump of the
parsed item list that an earlier version printed is not emitted, so the
stream handed to the next stage contains nothing but records.
"""
import sys


def map_user_record(line):
    """Return the list of output records for one ``<user> <items>`` line.

    Lines that do not consist of exactly two whitespace-separated fields
    (for example blank lines) produce no records instead of raising.
    """
    fields = line.split()
    if len(fields) != 2:
        return []
    item_str = fields[1]

    # De-duplicate so a user counts once per item, and sort so that every
    # pair is always emitted under the same (smaller, larger) key order.
    # Empty names (from "a,,b") are dropped: they would shift the columns.
    items = sorted(set(item for item in item_str.split(',') if item))

    records = []
    for pos, i1 in enumerate(items):
        records.append("%s 1 norm" % i1)
        for i2 in items[pos + 1:]:
            records.append("%s %s 1 dot" % (i1, i2))
    return records


def main():
    """Stream stdin through :func:`map_user_record` to stdout."""
    for line in sys.stdin:
        for record in map_user_record(line):
            sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
reducer_1.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Round-1 reducer: sum the ``norm`` and ``dot`` counts of each item.

Input (stdin): the round-1 mapper records, grouped so that all lines with
the same first field are contiguous::

    <i1> <count> norm
    <i1> <i2> <count> dot

Output (stdout), for every item i1::

    <i1> <norm_i1> norm
    <i1>-<i2> <dot> <norm_i1> dot-norm_i1     one line per partner item i2

Lines whose last field is neither ``norm`` nor ``dot`` are ignored.
"""
import sys


def _flush_item(i1, norm, dots):
    """Format the output records for one finished item group."""
    records = ["%s %d norm" % (i1, norm)]
    # Sorted so the output order does not depend on dict ordering.
    for i2 in sorted(dots):
        records.append("%s-%s %d %d dot-norm_i1" % (i1, i2, dots[i2], norm))
    return records


def reduce_item_counts(lines):
    """Yield the reduced records for an iterable of grouped input lines.

    The ``norm`` and ``dot`` lines of one item may arrive in any order
    within its group: a group that starts with a ``dot`` line begins with
    a norm of 0 and keeps that first pair count.
    """
    current = None
    norm = 0
    dots = {}
    for line in lines:
        sp = line.split()
        if not sp:
            continue
        tag = sp[-1]
        # Need <key> <count> norm  or  <key> <i2> <count> dot.
        if tag == 'norm':
            if len(sp) < 3:
                continue
        elif tag == 'dot':
            if len(sp) < 4:
                continue
        else:
            continue

        key = sp[0]
        if key != current:
            if current is not None:
                for record in _flush_item(current, norm, dots):
                    yield record
            current, norm, dots = key, 0, {}

        if tag == 'norm':
            norm += int(sp[1])
        else:
            i2 = sp[1]
            dots[i2] = dots.get(i2, 0) + int(sp[2])

    if current is not None:
        for record in _flush_item(current, norm, dots):
            yield record


def main():
    """Stream stdin through :func:`reduce_item_counts` to stdout."""
    for record in reduce_item_counts(sys.stdin):
        sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
mapper_2.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Round-2 mapper: re-key each pair record by its second item.

Input (stdin): the round-1 reducer records::

    <i> <norm_i> norm
    <i1>-<i2> <dot> <norm_i1> dot-norm_i1

Output (stdout)::

    <i> <norm_i> norm                       passed through unchanged
    <i2> <i1> <dot> <norm_i1> dot-norm_i1   pair record, now keyed by i2

Keying the pair by i2 puts it in the same group as the ``norm`` record of
i2, so the next stage sees norm_i1, norm_i2 and dot together.
"""
import sys


def rekey_record(line):
    """Return the output record for one input line, or ``None`` to skip it.

    Raises:
        ValueError: if a pair key does not split into exactly two item
            names on ``-`` (item names containing ``-`` are ambiguous).
    """
    sp = line.split()
    if not sp:
        return None
    if sp[-1] == 'norm':
        return line.strip()
    if sp[-1] == 'dot-norm_i1' and len(sp) >= 4:
        key, dot, norm_i1 = sp[:3]
        parts = key.split('-')
        if len(parts) != 2:
            raise ValueError("cannot split pair key %r into two items" % key)
        i1, i2 = parts
        return "%s %s %s %s dot-norm_i1" % (i2, i1, dot, norm_i1)
    return None


def main():
    """Stream stdin through :func:`rekey_record` to stdout."""
    for line in sys.stdin:
        record = rekey_record(line)
        if record is not None:
            sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
reducer_2.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Round-2 reducer: combine dot, norm_i1 and norm_i2 into a similarity.

Input (stdin): the round-2 mapper records, grouped so that all lines with
the same first field (item i2) are contiguous::

    <i2> <norm_i2> norm
    <i2> <i1> <dot> <norm_i1> dot-norm_i1

Output (stdout), one line per item pair::

    <i1>-<i2> <dot> <norm_i1> <norm_i2> <sim> dot,norm_i1,norm_i2,sim

with ``sim = dot / (norm_i1 + norm_i2 - dot)``.
"""
import sys


def gen_sim(norm_i1, norm_i2, dot):
    """Return dot / (norm_i1 + norm_i2 - dot) as a float.

    Arguments may be ints or their decimal string form.
    """
    return float(dot) / (int(norm_i1) + int(norm_i2) - int(dot))


def _flush_item(i2, norm_i2, pairs):
    """Format the output records for one finished i2 group."""
    if pairs and norm_i2 is None:
        # Without norm_i2 the denominator cannot be formed; using a value
        # left over from another item would silently give wrong results.
        raise ValueError("no 'norm' record found for item %r" % i2)
    records = []
    # Sorted so the output order does not depend on dict ordering.
    for i1 in sorted(pairs):
        dot, norm_i1 = pairs[i1]
        sim = gen_sim(norm_i1, norm_i2, dot)
        records.append(
            "%s-%s %s %s %s %s dot,norm_i1,norm_i2,sim"
            % (i1, i2, dot, norm_i1, norm_i2, sim)
        )
    return records


def reduce_similarity(lines):
    """Yield similarity records for an iterable of grouped input lines.

    Within one group the ``norm`` line may come before or after the pair
    lines. If a group has several ``norm`` lines the last one wins; if a
    partner item appears more than once, its first record is kept.
    """
    current = None
    norm_i2 = None
    pairs = {}
    for line in lines:
        sp = line.split()
        if not sp:
            continue
        tag = sp[-1]
        # Need <i2> <norm> norm  or  <i2> <i1> <dot> <norm_i1> dot-norm_i1.
        if tag == 'norm':
            if len(sp) < 3:
                continue
        elif tag == 'dot-norm_i1':
            if len(sp) < 5:
                continue
        else:
            continue

        key = sp[0]  # key is i2
        if key != current:
            if current is not None:
                for record in _flush_item(current, norm_i2, pairs):
                    yield record
            current, norm_i2, pairs = key, None, {}

        if tag == 'norm':
            norm_i2 = sp[1]
        else:
            pairs.setdefault(sp[1], (sp[2], sp[3]))

    if current is not None:
        for record in _flush_item(current, norm_i2, pairs):
            yield record


def main():
    """Stream stdin through :func:`reduce_similarity` to stdout."""
    for record in reduce_similarity(sys.stdin):
        sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
Execute script t.sh:
#!/bin/bash
# Local dry run of the two-round item-similarity pipeline: each round pipes
# its mapper's output through sort (standing in for the shuffle) into the
# matching reducer.
#
# Usage: ./t.sh [input_file]      (input defaults to user_log.txt)
#
# Files written: d.m.1, d.r.1 (round 1) and d.m.2, d.r.2 (round 2, result).

# Stop at the first failing stage instead of feeding partial data onward.
set -euo pipefail

input="${1:-user_log.txt}"

# The reducers need all records of one key to be contiguous. Sorting on the
# first field in the C locale guarantees that; other locales may ignore
# blanks and punctuation when collating and interleave different keys.

# Round 1: per-item counts and per-pair co-occurrence counts.
./mapper_1.py < "$input" | LC_ALL=C sort -k1,1 > d.m.1
./reducer_1.py < d.m.1 > d.r.1

# Round 2: re-key pairs by their second item and compute the similarity.
./mapper_2.py < d.r.1 | LC_ALL=C sort -k1,1 > d.m.2
./reducer_2.py < d.m.2 > d.r.2
Optimizing the Hadoop implementation of ItemCF (item-based collaborative filtering) in Python