資料採礦--頻繁集測試--Apriori演算法--java實現

來源:互聯網
上載者:User

2013年11月19日註:以下演算法中,combine 方法的實現不正確,候選項集應該從已有的頻繁集中產生,需要進一步修改。

=================================================================================

Apriori演算法原理:

如果某個項集是頻繁的,那麼它所有的子集也是頻繁的。如果一個項集是非頻繁的,那麼它所有的超集也是非頻繁的。

示意圖

圖一:

圖二:


package cn.ffr.frequent.apriori;import java.io.BufferedReader;import java.io.InputStreamReader;import java.net.URL;import java.util.ArrayList;import java.util.HashMap;import java.util.HashSet;import java.util.List;import java.util.Map;import java.util.Set;/** * Apriori的核心代碼實現 * @author neu_fufengrui@163.com */public class Apriori {public static final String STRING_SPLIT = ",";/** * 主要的計算方法 * @param data 資料集 * @param minSupport 最小支援度 * @param maxLoop 最大執行次數,設NULL為擷取最終結果 * @param containSet 結果中必須包含的子集 * @return */public Map<String, Double> compute(List<String[]> data, Double minSupport, Integer maxLoop, String[] containSet){//校正if(data == null || data.size() <= 0){return null;}//初始化Map<String, Double> result = new HashMap<String, Double>();Object[] itemSet = getDataUnitSet(data);int loop = 0;//核心迴圈處理過程while(true){//重要步驟一:合并,產生新的頻繁集Set<String> keys = combine(result.keySet(), itemSet);result.clear();//移除之前的結果for(String key : keys){result.put(key, computeSupport(data, key.split(STRING_SPLIT)));}//重要步驟二:修剪,去除支援度小於條件的。cut(result, minSupport, containSet);loop++;//輸出計算過程System.out.println("loop ["+loop+"], result : "+result);//迴圈結束條件if(result.size() <= 0){break;}if(maxLoop != null && maxLoop > 0 && loop >= maxLoop){//可控制迴圈執行次數break;}}return result;}/** * 計運算元集的支援度 *  * 支援度 = 子集在資料集中的資料項目 / 總的資料集的資料項目 *  * 資料項目的意思是一條資料。 * @param data 資料集 * @param subSet 子集  * @return */public Double computeSupport(List<String[]> data, String[] subSet){Integer value = 0;for(int i = 0; i < data.size(); i++){if(contain(data.get(i), subSet)){value ++;}}return value*1.0/data.size();}/** * 獲得初始化唯一的資料集,用於初始化 * @param data * @return */public Object[] getDataUnitSet(List<String[]> data){List<String> uniqueKeys = new ArrayList<String>();for(String[] dat : data){for(String da : dat){if(!uniqueKeys.contains(da)){uniqueKeys.add(da);}}}return uniqueKeys.toArray();}/** * 合并src和target來擷取頻繁集 * 增加頻繁集的計算維度 * @param src * @param target * @return */public Set<String> combine(Set<String> src, Object[] 
target){Set<String> dest = new HashSet<String>();if(src == null || src.size() <= 0){for(Object t : target){dest.add(t.toString());}return dest;}for(String s : src){for(Object t : target){if(s.indexOf(t.toString())<0){String key = s+STRING_SPLIT+t;if(!contain(dest, key)){dest.add(key);}}}}return dest;}/** * dest集中是否包含了key * @param dest * @param key * @return */public boolean contain(Set<String> dest, String key){for(String d : dest){if(equal(d.split(STRING_SPLIT), key.split(STRING_SPLIT))){return true;}}return false;}/** * 移除結果中,支援度小於所需要的支援度的結果。 * @param result * @param minSupport * @return */public Map<String, Double> cut(Map<String, Double> result, Double minSupport, String[] containSet){for(Object key : result.keySet().toArray()){//防止 java.util.ConcurrentModificationException,使用keySet().toArray()if(minSupport != null && minSupport > 0 && minSupport < 1 && result.get(key) < minSupport){//比較支援度result.remove(key);}if(containSet != null && containSet.length > 0 && !contain(key.toString().split(STRING_SPLIT), containSet)){result.remove(key);}}return result;}/** * src中是否包含dest,需要迴圈遍曆查詢 * @param src * @param dest * @return */public static boolean contain(String[] src, String[] dest){for(int i = 0; i < dest.length; i++){int j = 0;for(; j < src.length; j++){if(src[j].equals(dest[i])){break;}}if(j == src.length){return false;//can not find}}return true;}/** * src是否與dest相等 * @param src * @param dest * @return */public boolean equal(String[] src, String[] dest){if(src.length == dest.length && contain(src, dest)){return true;}return false;}/** * 主測試方法 * 測試方法,挨個去掉注釋,進行測試。 */public static void main(String[] args) throws Exception{//test 1//List<String[]> data = loadSmallData();//Long start = System.currentTimeMillis();//Map<String, Double> result = new Apriori().compute(data, 0.5, 3, null);//求支援度大於指定值//Long end = System.currentTimeMillis();//System.out.println("Apriori Result [costs:"+(end-start)+"ms]: ");//for(String key : result.keySet()){//System.out.println("\tFrequent 
Set=["+key+"] & Support=["+result.get(key)+"];");//}//test 2//List<String[]> data = loadMushRoomData();//Long start = System.currentTimeMillis();//Map<String, Double> result = new Apriori().compute(data, 0.3, 4, new String[]{"2"});//求支援度大於指定值//Long end = System.currentTimeMillis();//System.out.println("Apriori Result [costs:"+(end-start)+"ms]: ");//for(String key : result.keySet()){//System.out.println("\tFrequent Set=["+key+"] & Support=["+result.get(key)+"];");//}//test 3List<String[]> data = loadChessData();Long start = System.currentTimeMillis();Map<String, Double> result = new Apriori().compute(data, 0.95, 3, null);//求支援度大於指定值Long end = System.currentTimeMillis();System.out.println("Apriori Result [costs:"+(end-start)+"ms]: ");for(String key : result.keySet()){System.out.println("\tFrequent Set=["+key+"] & Support=["+result.get(key)+"];");}}/* *SmallData: minSupport 0.5, maxLoop 3, containSet null, [costs: 16ms] *MushRoomData: minSupport 0.3, maxLoop 4, containSet {"2"}, [costs: 103250ms] *ChessData: minSupport 0.95, maxLoop 34, containSet {null, [costs: 9718ms] *///測試資料集-1public static List<String[]> loadSmallData() throws Exception{List<String[]> data = new ArrayList<String[]>();data.add(new String[]{"d1","d3","d4"});data.add(new String[]{"d2","d3","d5"});data.add(new String[]{"d1","d2","d3","d5"});data.add(new String[]{"d2","d5"});return data;}//測試資料集-2public static List<String[]> loadMushRoomData() throws Exception{String link = "http://fimi.ua.ac.be/data/mushroom.dat";URL url = new URL(link);BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));String temp = reader.readLine();List<String[]> result = new ArrayList<String[]>();int lineNumber = 0;while(temp != null){System.out.println("reading data... 
[No."+(++lineNumber)+"]");String[] item = temp.split(" ");result.add(item);temp = reader.readLine();}reader.close();return result;}//測試資料集-3public static List<String[]> loadChessData() throws Exception{String link = "http://fimi.ua.ac.be/data/chess.dat";URL url = new URL(link);BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));String temp = reader.readLine();List<String[]> result = new ArrayList<String[]>();int lineNumber = 0;while(temp != null){System.out.println("reading data... [No."+(++lineNumber)+"]");String[] item = temp.split(" ");result.add(item);temp = reader.readLine();}reader.close();return result;}}


演算法原理:



相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.