Spark Data Statistics (Java Edition)


The code is based on Spark 2.1.2 and covers Dataset usage and Spark Streaming data statistics.
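The Spark Streaming jobs themselves are not reproduced below; as a rough orientation only, a minimal sketch of a streaming count in Java might look like the following (the socket source, batch interval, and class name are illustrative and not taken from the project):

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

import java.util.Arrays;

public class StreamingCountSketch {
    public static void main(String[] args) throws InterruptedException {
        // Illustrative only: a local streaming context with a 10-second batch interval.
        SparkConf conf = new SparkConf().setAppName("StreamingCountSketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

        // Illustrative source; a real job would presumably consume the Flume-fed logs instead.
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);

        // Count occurrences of each token in every batch.
        JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        JavaPairDStream<String, Long> counts =
                words.mapToPair(w -> new Tuple2<>(w, 1L)).reduceByKey(Long::sum);
        counts.print();

        jssc.start();
        jssc.awaitTermination();
    }
}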

Project address: https://github.com/baifanwudi/big-data-analysis

SparkSQL demo: read a JSON file and write it into Hive

package com.adups.offline.hive.log;

import com.adups.base.AbstractSparkSql;
import com.adups.config.FlumePath;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class OtaAppLog extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(OtaAppLog.class);

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {
        int partitionNum = 4;
        String ptWithPre = DateUtil.pathPtWithPre(pt);
        String appLogPath = FlumePath.APP_LOG_PATH + ptWithPre;
        if (!existsPath(appLogPath)) {
            return;
        }

        // Read the JSON logs with an explicit schema, deduplicate and repartition.
        Dataset<Row> otaAppLog = spark.read().schema(produceSchema()).json(appLogPath)
                .distinct().repartition(partitionNum);
        otaAppLog.createOrReplaceTempView("otaAppLog");

        beforePartition(spark);
        // Overwrite the Hive partition for the given day.
        String sql = "insert overwrite table ota_app_log partition(pt='" + pt + "') "
                + "select mid,ip,version,deviceId,productId,continentEn,continentZh,countryEn,countryZh,"
                + "provinceEn,provinceZh,cityEn,cityZh,networkType,lac,cid,mcc,mnc,rxlev,num,goType,"
                + "createTime,dataType from otaAppLog";
        logger.warn("executing sql is: " + sql);
        spark.sql(sql);
    }

    public StructType produceSchema() {
        List<StructField> inputFields = new ArrayList<>();
        String splitSeq = ",";
        String stringType = "mid,ip,version,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh,"
                + "cityEn,cityZh,networkType,deviceId,lac,cid,mcc,mnc,rxlev,dataType";
        String timeType = "createTime";
        String longType = "productId";
        String integerType = "num,goType";
        for (String stringTmp : stringType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(stringTmp, DataTypes.StringType, true));
        }
        inputFields.add(DataTypes.createStructField(timeType, DataTypes.TimestampType, false));
        for (String integerTmp : integerType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(integerTmp, DataTypes.IntegerType, true));
        }
        for (String longTmp : longType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(longTmp, DataTypes.LongType, false));
        }
        return DataTypes.createStructType(inputFields);
    }

    public static void main(String[] args) throws Exception {
        String pt = DateUtil.producePtOrYesterday(args);
        OtaAppLog otaAppLog = new OtaAppLog();
        otaAppLog.runAll(pt);
    }
}
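The insert overwrite above assumes that a Hive table ota_app_log partitioned by pt already exists. The project's DDL is not shown in the article; a minimal sketch of what such a table could look like (column types and the parquet storage format are assumptions inferred from the select and the schema above):

import org.apache.spark.sql.SparkSession;

public class CreateOtaAppLogTable {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("CreateOtaAppLogTable").enableHiveSupport().getOrCreate();
        // Assumed DDL, not taken from the project: partitioning by pt lets
        // "insert overwrite ... partition(pt='...')" replace exactly one day of data.
        spark.sql("create table if not exists ota_app_log ("
                + "mid string, ip string, version string, deviceId string, productId bigint, "
                + "continentEn string, continentZh string, countryEn string, countryZh string, "
                + "provinceEn string, provinceZh string, cityEn string, cityZh string, "
                + "networkType string, lac string, cid string, mcc string, mnc string, rxlev string, "
                + "num int, goType int, createTime timestamp, dataType string) "
                + "partitioned by (pt string) stored as parquet");
        spark.stop();
    }
}

OtaAppLog extends AbstractSparkSql, the shared base class that creates the SparkSession and checks HDFS paths; it is reproduced next.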
package com.adups.base;

import com.adups.config.HiveConfig;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * @author Allen
 * Created by Allen on 04/08/2017.
 */
public abstract class AbstractSparkSql extends AbstractFileSystem {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * Spark operation.
     * @param pt   date in the format pt=2017-10-11
     * @param path HDFS path
     */
    public abstract void executeProgram(String pt, String path, SparkSession spark) throws IOException;

    public boolean existsPath(String... pathList) throws IOException {
        for (String path : pathList) {
            if (!fileSystem.exists(new Path(path))) {
                logger.error("the path: " + path + " does not exist");
                return false;
            } else {
                logger.warn("executing path is: " + path);
            }
        }
        return true;
    }

    public void runAll(String pt, String path, boolean isHiveSupport) throws IOException {
        if (path != null && !existsPath(path)) {
            logger.error("the src path does not exist: " + path);
            return;
        }
        executeSpark(pt, path, isHiveSupport);
    }

    /** No path check; Hive support is enabled by default. */
    public void runAll(String pt) throws IOException {
        runAll(pt, null, true);
    }

    public void runAll(String pt, String path) throws IOException {
        runAll(pt, path, true);
    }

    public void runAll(String pt, boolean isHiveSupport) throws IOException {
        runAll(pt, null, isHiveSupport);
    }

    private void executeSpark(String pt, String path, boolean isHiveSupport) throws IOException {
        SparkSession spark;
        String appName = this.getClass().getSimpleName();
        if (isHiveSupport) {
            spark = SparkSession.builder().appName(appName).enableHiveSupport().getOrCreate();
            logger.info("spark enable hive, begin to execute");
            useDatabase(spark);
        } else {
            spark = SparkSession.builder().appName(appName).getOrCreate();
            logger.info("spark begin to execute");
        }
        executeProgram(pt, path, spark);
        logger.info("spark has finished");
    }

    private void useDatabase(SparkSession spark) {
        logger.info("before the sql: " + HiveConfig.SQL_DATABASE);
        spark.sql(HiveConfig.SQL_DATABASE);
    }

    public void beforePartition(SparkSession spark) {
        spark.sql(HiveConfig.HIVE_PARTITION);
    }
}
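AbstractSparkSql itself extends AbstractFileSystem, which the article does not reproduce; the fileSystem field used by existsPath presumably comes from it. A minimal sketch of what that parent class might look like (an assumption, not the project's code):

package com.adups.base;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;

/** Assumed sketch: exposes an HDFS FileSystem handle to subclasses such as AbstractSparkSql. */
public abstract class AbstractFileSystem {

    protected FileSystem fileSystem;

    protected AbstractFileSystem() {
        try {
            // Picks up fs.defaultFS from the cluster's core-site.xml on the classpath.
            this.fileSystem = FileSystem.get(new Configuration());
        } catch (IOException e) {
            throw new RuntimeException("failed to initialise the HDFS file system", e);
        }
    }
}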
Spark demo: read data files, compute statistics, and write the results to MySQL
package com.adups.online.flume;

import com.adups.base.AbstractSparkSql;
import com.adups.bean.out.DeviceArea;
import com.adups.config.OnlineOfflinePath;
import com.adups.common.ReadTable;
import com.adups.common.sql.flume.DeviceAreaOnlineSave;
import com.adups.util.CommonUtil;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.collection.Seq;

import static org.apache.spark.sql.functions.*;

import java.io.IOException;

/**
 * @author Allen
 * Created by Allen on 03/08/2017.
 */
public class DeviceAreaOnline extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {

        String prePath = DateUtil.pathPtWithPre(pt);
        String nowPt = DateUtil.nowPtDay();
        String beginTime = nowPt + " 00:00:00";
        String endTime = nowPt + " 23:59:59";

        String deviceTotal = OnlineOfflinePath.OFFLINE_DEVICE_NEW_TOTAL_PATH + prePath;
        String deviceAreaTotal = OnlineOfflinePath.OFFLINE_DEVICE_AREA_NEW_TOTAL_PATH + prePath;
        String originAreaPath = OnlineOfflinePath.ONLINE_DEVICE_AREA_NEW_TOTAL_PATH;
        if (!existsPath(deviceTotal, deviceAreaTotal)) {
            return;
        }

        // Today's newly registered devices, filtered by create_time and loaded via the project's ReadTable helper.
        String where = "(select product_id as productId,device_id as deviceId,country_zh as country,province_zh as province "
                + "from iot_register.device_info where create_time between '" + beginTime + "' and '" + endTime + "') as device_time_filter";
        Dataset<Row> todayDevice = new ReadTable().loadTable(spark, where).coalesce(1);

        // Per-product totals: yesterday's cumulative numbers plus today's distinct new devices.
        Dataset<Row> yesterdayStats = spark.read().parquet(deviceTotal).select("productId", "totalNum");
        Dataset<Row> totalIncrement = todayDevice.groupBy("productId")
                .agg(functions.countDistinct("deviceId").as("newNum"));

        Seq<String> seq = CommonUtil.columnNames("productId");
        Seq<String> naFillZero = CommonUtil.columnNames("newNum,totalNum");
        Dataset<Row> result = yesterdayStats.join(totalIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("newNum"), col("newNum").plus(col("totalNum")).as("totalNum"))
                .withColumn("pt", lit(nowPt)).coalesce(1);

        // The same aggregation, broken down by country and province.
        Dataset<Row> yesterdayAreaStatistics = spark.read().parquet(deviceAreaTotal)
                .select("productId", "country", "province", "totalNum").toDF();
        Dataset<Row> areaIncrement = todayDevice.groupBy("productId", "country", "province")
                .agg(functions.countDistinct("deviceId").as("newNum"));

        seq = CommonUtil.columnNames("productId,country,province");
        Dataset<Row> areaResult = yesterdayAreaStatistics.join(areaIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("country"), col("province"), col("newNum"),
                        col("newNum").plus(col("totalNum")).as("totalNum"))
                .withColumn("pt", lit(nowPt)).coalesce(1);

        // Only the rows that changed since the last snapshot are pushed to MySQL.
        Dataset<DeviceArea> deltaArea;
        if (existsPath(originAreaPath)) {
            try {
                Dataset<Row> originBase = spark.read().parquet(originAreaPath);
                deltaArea = areaResult.except(originBase).coalesce(1).as(new DeviceArea().produceBeanEncoder());
            } catch (Exception e) {
                logger.error(e.getMessage());
                deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
            }
        } else {
            deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
        }

        try {
            insertToMysql(deltaArea);
        } catch (Exception e) {
            logger.error(e.getMessage());
        }

        areaResult.write().mode("overwrite").format("parquet").save(originAreaPath);
        result.write().mode("overwrite").format("parquet").save(OnlineOfflinePath.ONLINE_DEVICE_NEW_TOTAL_PATH);
    }

    public void insertToMysql(Dataset<DeviceArea> dataset) {
        dataset.foreachPartition(data -> {
            String sql = "insert into stats_device_area(product_id,country,province,new_num,total_num,pt) values(?,?,?,?,?,?) "
                    + "on duplicate key update new_num=?,total_num=?";
            new DeviceAreaOnlineSave().putDataBatch(data, sql);
        });
    }

    public static void main(String[] args) throws IOException {
        String pt = DateUtil.producePtOrYesterday(args);
        DeviceAreaOnline deviceAreaOnline = new DeviceAreaOnline();
        deviceAreaOnline.runAll(pt, false);
    }
}
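ReadTable.loadTable and DeviceAreaOnlineSave.putDataBatch are project helpers that the article does not show. Presumably loadTable wraps spark.read().jdbc(...) over the subquery alias, and putDataBatch runs the upsert as a JDBC batch per partition. A minimal sketch of the write side under those assumptions (the connection settings, class name, and DeviceArea getters are illustrative, not the project's code):

import com.adups.bean.out.DeviceArea;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.Iterator;

/** Assumed sketch of a per-partition JDBC batch upsert; not the project's DeviceAreaOnlineSave. */
public class DeviceAreaBatchWriter {

    public void putDataBatch(Iterator<DeviceArea> rows, String sql) throws Exception {
        // Illustrative connection settings; the project presumably reads them from configuration.
        try (Connection conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/stats", "user", "password");
             PreparedStatement ps = conn.prepareStatement(sql)) {
            while (rows.hasNext()) {
                DeviceArea area = rows.next();
                ps.setLong(1, area.getProductId());
                ps.setString(2, area.getCountry());
                ps.setString(3, area.getProvince());
                ps.setLong(4, area.getNewNum());
                ps.setLong(5, area.getTotalNum());
                ps.setString(6, area.getPt());
                // Values reused by "on duplicate key update new_num=?, total_num=?".
                ps.setLong(7, area.getNewNum());
                ps.setLong(8, area.getTotalNum());
                ps.addBatch();
            }
            ps.executeBatch();
        }
    }
}

Batching one statement per partition keeps the number of MySQL connections equal to the number of partitions, which is why the job coalesces its result datasets to a single partition before writing.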