The Apriori algorithm is an easy-to-understand, logically simple, and easy-to-implement algorithm for finding frequent itemsets in large data sets.
Set the minimum support count to 3; that is, an item whose count is greater than or equal to 3 is a frequent item.
Counting each item in the original database gives Figure 2; the items whose counts reach the minimum support count form the frequent 1-itemsets shown in Figure 3.
Fig. 1 Fig. 2 Fig. 3
Pair up the frequent 1-itemsets two by two to get the candidate 2-itemsets {a,b}, {a,e}, {b,e}.
Scan the original database (Figure 1) to count how many transactions contain each candidate.
The counts are {a,b}: 2, {a,e}: 3, {b,e}: 1, so the only frequent 2-itemset is {a,e}. Since only one frequent 2-itemset remains, no candidate 3-itemset can be generated, so the program ends.
The program flowchart is shown below:
Figure 5 (image from the Web, for learning reference only)
C++ code: a full description and a large test data set can be downloaded from http://download.csdn.net/download/my_acm/8042909
To keep the programming simple, this program reads the raw data directly into memory, so it cannot handle very large data sets; a real Apriori implementation that processes ultra-large-scale data has to read the data from the database over and over again, once per pass. This repeated scanning is one of the reasons why the Apriori algorithm is not efficient.
/*
 * Function: Apriori algorithm - finds all frequent itemsets of a transaction
 *           database read from a text file (one transaction per line, item
 *           ids as whitespace-separated integers).
 * Originally built and run with Code::Blocks 12.11.
 *
 * Notes on this revision:
 *  - Identifier spelling is normalised to one form (Item, i_list, item,
 *    fre_item, list_item, input, output, ...).
 *  - Lines are read with std::getline instead of gets(), so line length is
 *    no longer limited and cannot overflow a buffer.
 *  - Transactions are stored in vectors, so the number of transactions is no
 *    longer limited by MAXN.
 *  - Duplicate item ids inside one transaction are collapsed and lines
 *    without any item are skipped, so every support value counts
 *    transactions.
 *  - Frequent 1-itemsets are reported even when no larger itemset is
 *    frequent.
 */
#include <vector>
#include <iostream>
#include <cstring>
#include <stdio.h>
#include <algorithm>
#include <math.h>
#include <map>
#include <sstream>
#include <string>
using namespace std;

#define MAXN 100000   // historical transaction limit; storage is dynamic now, kept for compatibility
#define maxitem 90    // not used in this file; kept for compatibility
#define suport 0.01   // minimum support, as a fraction of the number of transactions

// One transaction of the database.
struct Item {
    vector<int> num;  // item ids; input() stores them sorted ascending without duplicates
};

typedef vector<int> i_list;  // one itemset, item ids in ascending order

class apriori {
public:
    int suport_num;  // minimum support count (number of transactions)
    int num_max;     // largest number of items found in a single transaction
    int num_sol;     // number of transactions
    int num_fre;     // largest k for which a frequent k-itemset was found
    vector<Item> item;                 // all transactions
    vector<vector<i_list> > fre_item;  // fre_item[k] holds the frequent k-itemsets (index 0 unused)
    map<int, int> list_item;           // item id -> number of transactions containing it

public:
    // Resets all counters and containers so a run starts from a clean state.
    void init() {
        num_fre = 0;
        num_max = 0;
        suport_num = 0;
        num_sol = 0;
        item.clear();
        fre_item.clear();
        list_item.clear();
    }

    // Reads every transaction from standard input, fills item / list_item /
    // num_max / num_sol, derives suport_num and prints a one-line summary.
    // Parsing of a line stops at the first token that is not an integer.
    void input() {
        string line;
        while (getline(cin, line)) {
            i_list row;
            istringstream tokens(line);
            int x;
            while (tokens >> x) {
                row.push_back(x);
            }
            if (row.empty()) {
                continue;  // a line without items is not a transaction
            }

            // Sorted, duplicate-free rows let judge() use std::includes and
            // keep the 1-item counts consistent with the k-item counts.
            sort(row.begin(), row.end());
            row.erase(unique(row.begin(), row.end()), row.end());

            for (size_t i = 0; i < row.size(); ++i) {
                ++list_item[row[i]];
            }
            if (static_cast<int>(row.size()) > num_max) {
                num_max = static_cast<int>(row.size());
            }

            item.push_back(Item());
            item.back().num.swap(row);  // hand the buffer over without copying it
        }

        num_sol = static_cast<int>(item.size());
        suport_num = static_cast<int>(ceil(suport * num_sol));
        cout << "data total rows: " << num_sol
             << ", minimum support: " << suport
             << ", minimum support count: " << suport_num << '\n';
    }

    // Prints the frequent itemsets level by level (k = 1 .. num_fre), one
    // itemset per line with the item ids separated by spaces.
    void output() {
        if (fre_item.size() < 2 || fre_item[1].empty()) {
            printf("No frequent item!\n");
            return;
        }
        for (int k = 1; k <= num_fre && k < static_cast<int>(fre_item.size()); ++k) {
            printf("%d frequent item is:\n", k);
            const vector<i_list> &level = fre_item[k];
            for (size_t i = 0; i < level.size(); ++i) {
                for (size_t j = 0; j < level[i].size(); ++j) {
                    printf("%d ", level[i][j]);
                }
                printf("\n");
            }
            printf("\n");
        }
    }

    // Join step: when t1 and t2 have the same non-zero length, agree on every
    // item but the last and differ in the last one, tmp becomes t1 followed
    // by the last item of t2. In every other case tmp is left untouched.
    void LCS(i_list &tmp, const i_list &t1, const i_list &t2) {
        if (t1.empty() || t1.size() != t2.size()) {
            return;
        }
        const size_t last = t1.size() - 1;
        if (!equal(t1.begin(), t1.begin() + last, t2.begin())) {
            return;  // the shared prefix differs
        }
        if (t1[last] == t2[last]) {
            return;  // identical itemsets cannot be joined
        }
        i_list joined(t1);
        joined.push_back(t2[last]);
        tmp.swap(joined);
    }

    // Returns 1 when at least suport_num transactions contain every item of
    // tmp, 0 otherwise. Relies on the transactions being sorted ascending,
    // which input() guarantees.
    int judge(i_list tmp) {
        // std::includes needs both ranges sorted; tmp is a by-value copy, so
        // normalising it here keeps the result independent of the caller's order.
        sort(tmp.begin(), tmp.end());
        tmp.erase(unique(tmp.begin(), tmp.end()), tmp.end());

        int sum = 0;
        for (size_t i = 0; i < item.size(); ++i) {
            const vector<int> &row = item[i].num;
            if (row.size() < tmp.size()) {
                continue;  // too short to contain the whole itemset
            }
            if (includes(row.begin(), row.end(), tmp.begin(), tmp.end())) {
                ++sum;
            }
        }
        return sum >= suport_num ? 1 : 0;
    }

    // Builds fre_item[1] from the per-item counts, then derives the frequent
    // k-itemsets (k >= 2) level by level until a level cannot produce any
    // further candidate. Sets num_fre to the last non-empty level.
    void find_fre_item() {
        fre_item.assign(static_cast<size_t>(num_max) + 2, vector<i_list>());

        // std::map iterates in ascending key order, so fre_item[1] is sorted.
        for (map<int, int>::const_iterator it = list_item.begin(); it != list_item.end(); ++it) {
            if (it->second >= suport_num) {
                fre_item[1].push_back(i_list(1, it->first));
            }
        }
        list_item.clear();  // the counts are not needed any more

        if (fre_item[1].empty()) {
            return;
        }
        num_fre = 1;  // frequent single items exist even if no larger itemset does

        for (int k = 2; k <= num_max; ++k) {
            const vector<i_list> &prev = fre_item[k - 1];
            if (prev.size() < 2) {
                break;  // at least two (k-1)-itemsets are needed for a join
            }
            for (size_t i = 0; i < prev.size(); ++i) {
                for (size_t j = i + 1; j < prev.size(); ++j) {
                    i_list tmp;
                    LCS(tmp, prev[i], prev[j]);
                    if (tmp.empty()) {
                        continue;
                    }
                    if (!all_subsets_frequent(tmp, prev)) {
                        continue;  // pruned without scanning the database
                    }
                    if (judge(tmp)) {
                        fre_item[k].push_back(tmp);
                        num_fre = k;
                    }
                }
            }
        }
    }

    // Runs the whole pipeline: reset, read, mine, print.
    void calculate_fre_item() {
        init();
        input();
        find_fre_item();
        output();
    }

private:
    // Apriori pruning: true when every subset of cand obtained by dropping a
    // single item is present in prev. prev must be sorted lexicographically;
    // find_fre_item() produces each level in that order because it joins
    // pairs (i, j) with i < j from an already sorted level.
    static bool all_subsets_frequent(const i_list &cand, const vector<i_list> &prev) {
        // Dropping either of the last two items gives back the two itemsets
        // that were joined, so only the earlier positions need a lookup.
        for (size_t drop = 0; drop + 2 < cand.size(); ++drop) {
            i_list sub;
            sub.reserve(cand.size() - 1);
            for (size_t i = 0; i < cand.size(); ++i) {
                if (i != drop) {
                    sub.push_back(cand[i]);
                }
            }
            if (!binary_search(prev.begin(), prev.end(), sub)) {
                return false;
            }
        }
        return true;
    }
} sol;

// Redirects standard input to the data file and standard output to the
// result file, then runs the computation. Returns 1 when a file cannot be
// opened.
int main() {
    if (freopen("retail.dat", "r", stdin) == NULL) {
        fprintf(stderr, "cannot open retail.dat for reading\n");
        return 1;
    }
    if (freopen("result.txt", "w", stdout) == NULL) {
        fprintf(stderr, "cannot open result.txt for writing\n");
        return 1;
    }
    sol.calculate_fre_item();
    return 0;
}
Apriori algorithm and its code