大資料演算法設計模式(1)

大資料演算法設計模式(1) - topN spark實現

最後更新：2017-08-27 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

標籤：value 函數 set 計算 int context split 廣播

topN演算法，spark實現

package com.kangaroo.studio.algorithms.topn;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

import java.io.Serializable;
import java.util.*;

/**
 * Top-N over "key,count" text lines, implemented with Spark.
 *
 * <p>Pipeline: read lines, parse each into a (key, count) pair, sum the counts
 * per key, keep the N largest pairs inside every partition, then merge the
 * per-partition candidates on the driver and print them.
 *
 * <p>Pairs are ranked by count; pairs with equal counts are ranked by key so
 * that ties are kept as separate entries and the result is deterministic.
 */
public class TopnSpark implements Serializable {

    /**
     * Orders pairs by ascending count, breaking ties by ascending key.
     * The head of a priority queue built on it is therefore the weakest candidate.
     */
    private static final Comparator<Tuple2<String, Integer>> BY_COUNT_THEN_KEY =
            new Comparator<Tuple2<String, Integer>>() {
                @Override
                public int compare(Tuple2<String, Integer> a, Tuple2<String, Integer> b) {
                    int byCount = a._2().compareTo(b._2());
                    return byCount != 0 ? byCount : a._1().compareTo(b._1());
                }
            };

    /*
     * transient: the anonymous function objects created in run() are inner
     * classes that read this instance's topNum field, so they reference this
     * instance. The driver-side context must not be serialized along with them.
     */
    private transient JavaSparkContext jsc;
    Broadcast<Integer> topNum;
    private String inputPath;

    /**
     * Creates the Spark context, broadcasts N and remembers the input path.
     *
     * @param Num  how many entries to keep (N); values of zero or less yield an empty result
     * @param path input path handed to {@code JavaSparkContext.textFile}
     * @throws NullPointerException if {@code Num} is null
     */
    public TopnSpark(Integer Num, String path) {
        if (Num == null) {
            throw new NullPointerException("Num must not be null");
        }
        jsc = new JavaSparkContext();
        // Broadcast N so the functions below can read it via topNum.getValue().
        topNum = jsc.broadcast(Num);
        inputPath = path;
    }

    /**
     * Runs the job and prints the result to standard output, one
     * {@code count -- key} line per entry, in ascending order of count.
     */
    public void run() {
        // Read the input; the second argument is the minimum number of partitions.
        JavaRDD<String> lines = jsc.textFile(inputPath, 1);

        // Cap the number of partitions at 9.
        // NOTE(review): coalesce without shuffle only lowers the partition count;
        // it will not raise it to 9 if the input has fewer - confirm this is intended.
        JavaRDD<String> rdd = lines.coalesce(9);

        // Parse "key,count" lines into (key, count) pairs. Keys are not unique yet.
        JavaPairRDD<String, Integer> kv = rdd.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                String[] tokens = s.split(",");
                return new Tuple2<String, Integer>(tokens[0], Integer.parseInt(tokens[1]));
            }
        });

        // Make keys unique by summing the counts of equal keys.
        JavaPairRDD<String, Integer> uniqueKeys = kv.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) throws Exception {
                return i1 + i2;
            }
        });

        // Local top N: every partition emits exactly one list holding at most N pairs.
        JavaRDD<List<Tuple2<String, Integer>>> partitions = uniqueKeys.mapPartitions(
                new FlatMapFunction<Iterator<Tuple2<String, Integer>>, List<Tuple2<String, Integer>>>() {
                    @Override
                    public Iterable<List<Tuple2<String, Integer>>> call(Iterator<Tuple2<String, Integer>> iter)
                            throws Exception {
                        final int limit = topNum.getValue();
                        return Collections.singletonList(keepLargest(iter, limit));
                    }
                });

        // Global top N: merge the per-partition lists pairwise. fold (rather than
        // reduce) is used so that an RDD without partitions yields an empty result
        // instead of an exception.
        List<Tuple2<String, Integer>> merged = partitions.fold(
                new ArrayList<Tuple2<String, Integer>>(),
                new Function2<List<Tuple2<String, Integer>>, List<Tuple2<String, Integer>>,
                        List<Tuple2<String, Integer>>>() {
                    @Override
                    public List<Tuple2<String, Integer>> call(List<Tuple2<String, Integer>> m1,
                                                              List<Tuple2<String, Integer>> m2) throws Exception {
                        final int limit = topNum.getValue();
                        List<Tuple2<String, Integer>> both =
                                new ArrayList<Tuple2<String, Integer>>(m1.size() + m2.size());
                        both.addAll(m1);
                        both.addAll(m2);
                        return keepLargest(both.iterator(), limit);
                    }
                });

        // The merged list is in heap order; sort a copy ascending by count for printing.
        List<Tuple2<String, Integer>> finalTopN = new ArrayList<Tuple2<String, Integer>>(merged);
        Collections.sort(finalTopN, BY_COUNT_THEN_KEY);
        for (Tuple2<String, Integer> entry : finalTopN) {
            System.out.println(entry._2() + " -- " + entry._1());
        }
    }

    /**
     * Stops the Spark context created by the constructor. Safe to call when the
     * context reference is absent.
     */
    public void stop() {
        if (jsc != null) {
            jsc.stop();
        }
    }

    /**
     * Returns the {@code limit} largest pairs from {@code candidates}.
     *
     * <p>A min-heap bounded to {@code limit} elements is used: after each insert
     * the weakest element is evicted once the bound is exceeded. Pairs with the
     * same count are distinct heap elements, so ties do not overwrite each other.
     *
     * @param candidates pairs to select from
     * @param limit      maximum number of pairs to keep; zero or less keeps nothing
     * @return a new mutable list with at most {@code limit} pairs, in no particular order
     */
    private static List<Tuple2<String, Integer>> keepLargest(Iterator<Tuple2<String, Integer>> candidates,
                                                             int limit) {
        if (limit <= 0) {
            return new ArrayList<Tuple2<String, Integer>>();
        }
        // 11 is PriorityQueue's default initial capacity; the queue grows as needed.
        PriorityQueue<Tuple2<String, Integer>> heap =
                new PriorityQueue<Tuple2<String, Integer>>(11, BY_COUNT_THEN_KEY);
        while (candidates.hasNext()) {
            heap.add(candidates.next());
            if (heap.size() > limit) {
                heap.poll();
            }
        }
        return new ArrayList<Tuple2<String, Integer>>(heap);
    }

    /**
     * Entry point: computes the top 10 for the input path given as the first argument.
     *
     * @param args {@code args[0]} is the input path
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: TopnSpark <inputPath>");
            System.exit(1);
        }
        String inputPath = args[0];
        TopnSpark topnMapper = new TopnSpark(10, inputPath);
        try {
            topnMapper.run();
        } finally {
            // Release the context even when the job fails.
            topnMapper.stop();
        }
    }
}

大資料演算法設計模式(1) - topN spark實現

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More