Data-mining (DM) lab exercise. The implementation is fairly rough: many edge cases have not been considered yet, and it will be revised when time permits.
The data structures were not designed carefully at the start, which made the later parts laborious to write; fortunately, Python lists are flexible enough to rescue the design.
Data set: database.txt (one transaction per line, items separated by commas)
I1,I2,I5
I2,I4
I2,I3
I1,I2,I4
I1,I3
I2,I3
I1,I3
I1,I2,I3,I5
I1,I2,I3
apriori.py
# -*- coding: utf-8 -*-
"""Apriori frequent-itemset mining over a small transaction database.

Author: messiandzcy
File:   apriori.py
Date:   2014.12.3

The database is a text file with one transaction per line and items
separated by commas.  It is loaded into a '#'-padded matrix (row 0 and
column 0 are unused padding), converted to a vertical layout
(item -> list of transaction ids) and then mined level by level.

Every print call passes exactly one argument so the script produces the
same output under Python 2 and Python 3.
"""
from itertools import combinations


def matrix(num_of_transactions, num_of_items):
    """Return a (num_of_transactions+1) x (num_of_items+1) matrix of '#'.

    The extra row and column let transactions and items be addressed with
    1-based indices; '#' marks an empty cell.
    """
    return [['#'] * (num_of_items + 1) for _ in range(num_of_transactions + 1)]


def printf(mat, rows, cols):
    """Debug helper: print the top-left rows x cols corner of ``mat``.

    Cells of one row are separated by a single space, one row per line.
    """
    for i in range(rows):
        print(" ".join(str(mat[i][j]) for j in range(cols)))


def readFile(filename=r'database.txt'):
    """Load the transaction database and echo it as a TID/Items table.

    Args:
        filename: path of the database file (defaults to 'database.txt').

    Returns:
        ``(data, pos, max_j)`` where ``data`` is the '#'-padded matrix with
        transaction ``t`` stored in row ``t`` starting at column 1, ``pos``
        is the number of transactions plus one (rows to scan) and ``max_j``
        is the widest transaction plus one (columns to scan; 0 when the
        file holds no transactions).  Returns None when the file cannot be
        opened, after printing a diagnostic.
    """
    try:
        # 'with' guarantees the handle is closed even if reading fails.
        with open(filename, "r") as fp:
            raw_lines = [line.strip() for line in fp]
    except IOError:
        print("Read file --> '%s' failed!" % filename)
        print("file --> '%s' does not exist!" % filename)
        return None

    print("Reading File '%s' ..." % filename)
    print("%-10s%-10s" % ("TID", "Items"))

    # Blank lines (e.g. a trailing newline) are not transactions; empty
    # fields such as "a,,b" are not items.
    transactions = []
    for text in raw_lines:
        if not text:
            continue
        items = [field.strip() for field in text.split(",") if field.strip()]
        transactions.append(items)
        print("%-10d%-10s" % (len(transactions), text))

    widest = max([len(items) for items in transactions] or [0])
    # Never smaller than the historical 15 x 10 matrix, but grows on demand
    # so larger databases no longer overflow it.
    data = matrix(max(15, len(transactions)), max(10, widest))
    for row, items in enumerate(transactions, 1):
        for col, item in enumerate(items, 1):
            data[row][col] = item

    pos = len(transactions) + 1
    max_j = widest + 1 if transactions else 0
    print("Read File success!\n")
    return (data, pos, max_j)


def vertical(mat, rows, cols):
    """Convert the horizontal database matrix into a vertical layout.

    Args:
        mat: '#'-padded matrix as produced by ``readFile``.
        rows, cols: extent of ``mat`` to scan (exclusive upper bounds).

    Returns:
        ``(lst, I)``: ``lst`` is the sorted list of distinct cell values
        (the '#' padding value is included when present, as before) and
        ``I[k]`` is the ascending list of row indices (transaction ids)
        that contain ``lst[k]``.  ``I`` has exactly one list per entry of
        ``lst``, so the number of distinct items is not limited.
    """
    # Sorting makes the item order, and therefore the output, deterministic.
    lst = sorted(set(mat[i][j] for i in range(rows) for j in range(cols)))
    position = {item: k for k, item in enumerate(lst)}
    I = [[] for _ in lst]
    for i in range(rows):
        for j in range(cols):
            item = mat[i][j]
            if item == "#":
                continue
            tids = I[position[item]]
            # Rows are visited in ascending order, so a repeat of the same
            # item inside one transaction shows up as the last tid.
            if not tids or tids[-1] != i:
                tids.append(i)
    return (lst, I)


def count(itemset, lst, I):
    """Return the support count of ``itemset``.

    The support is the size of the intersection of the transaction-id
    lists of all items in ``itemset``.

    Raises:
        IndexError: if ``itemset`` is empty.
        ValueError: if an item does not occur in ``lst``.
    """
    tids = set(I[lst.index(itemset[0])])
    for item in itemset[1:]:
        tids &= set(I[lst.index(item)])
    return len(tids)


def _self_join(frequent, k):
    """Join frequent k-itemsets sharing their first k-1 items into (k+1)-candidates.

    Prefix groups are emitted in order of first appearance and pairs inside
    a group in their original order.  For k == 1 the prefix is empty, so
    this yields every pair of frequent items.
    """
    prefixes = []
    tails = {}
    for itemset in frequent:
        key = tuple(itemset[:k - 1])
        if key not in tails:
            tails[key] = []
            prefixes.append(key)
        tails[key].append(itemset[k - 1])
    return [list(key) + [first, second]
            for key in prefixes
            for first, second in combinations(tails[key], 2)]


def _print_itemsets(title, rows, separator=","):
    """Print a titled Items/Count table for (itemset, support) rows."""
    print(title)
    print("%-10s%-10s" % ("Items", "Count"))
    for itemset, support in rows:
        print("%-10s%-10d" % (separator.join(itemset), support))


def apriori(lst, I, min_sup):
    """Run Apriori and print every candidate set Ck and frequent set Lk.

    Args:
        lst: list of distinct items (may contain the '#' padding value).
        I: vertical database; ``I[k]`` holds the transaction ids of ``lst[k]``.
        min_sup: minimum support count for an itemset to be frequent.

    Returns:
        A list of ``(itemset, support)`` pairs for all frequent itemsets,
        in the order they were found (level by level).
    """
    print("Start to run Apriori!")

    # C1: every real item on its own.
    candidates = [[item] for item in lst if item != '#']
    scored = [(c, count(c, lst, I)) for c in candidates]
    _print_itemsets("\nCandidate: #1", scored, separator="")

    found = []
    iters = 1
    # The loop ends as soon as a level has no frequent itemset; candidates
    # grow by one item per level, so that happens after at most
    # len(lst) + 1 levels.
    while True:
        # Step 1: Ck -> Lk, keep only candidates reaching min_sup.
        level = [(c, s) for c, s in scored if s >= min_sup]
        _print_itemsets("\n#L%d" % iters, level)
        if not level:
            break
        found.extend((list(c), s) for c, s in level)

        # Step 2: Lk -> Ck+1 by self-join on the common (k-1)-prefix.
        candidates = _self_join([c for c, _ in level], iters)
        scored = [(c, count(c, lst, I)) for c in candidates]
        _print_itemsets("\nCandidate:#%d" % (iters + 1), scored)
        iters += 1
    return found


def main():
    """Read 'database.txt', build the vertical database and mine it."""
    loaded = readFile()
    if loaded is None:
        # readFile has already reported why the database could not be read.
        return
    data, pos, max_j = loaded
    lst, I = vertical(data, pos, max_j)
    min_sup = 2  # minimum support count
    apriori(lst, I, min_sup)


if __name__ == "__main__":
    main()
Apriori algorithm for frequent-pattern mining