Spark Data Statistics (Java Edition)


Built on Spark 2.1.2; the project covers Dataset usage and Spark Streaming data statistics.
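The demos below only cover the Spark SQL side; the Spark Streaming statistics live in the repository. For orientation, a job of that kind boils down to a keyed count over micro-batches. Here is a minimal sketch, not taken from the project; the source, host/port, and record layout are all assumptions:

    import org.apache.spark.SparkConf;
    import org.apache.spark.streaming.Durations;
    import org.apache.spark.streaming.api.java.JavaPairDStream;
    import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
    import org.apache.spark.streaming.api.java.JavaStreamingContext;
    import scala.Tuple2;

    public class StreamingCountSketch {
        public static void main(String[] args) throws InterruptedException {
            SparkConf conf = new SparkConf().setAppName("StreamingCountSketch");
            // 10-second micro-batches
            JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));
            // hypothetical socket source; the real project reads from Flume/Kafka
            JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);
            // count records per key (first CSV field) in each batch
            JavaPairDStream<String, Long> counts = lines
                    .mapToPair(line -> new Tuple2<>(line.split(",")[0], 1L))
                    .reduceByKey((a, b) -> a + b);
            counts.print();
            jssc.start();
            jssc.awaitTermination();
        }
    }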

Project address: https://github.com/baifanwudi/big-data-analysis

Spark SQL demo: read a JSON file and write to Hive

package com.adups.offline.hive.log;

import com.adups.base.AbstractSparkSql;
import com.adups.config.FlumePath;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class OtaAppLog extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(OtaAppLog.class);

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {
        int partitionNum = 4;
        String ptWithPre = DateUtil.pathPtWithPre(pt);
        String appLogPath = FlumePath.APP_LOG_PATH + ptWithPre;
        if (!existsPath(appLogPath)) {
            return;
        }
        // read the JSON logs with an explicit schema, de-duplicate, and repartition
        Dataset<Row> otaAppLog = spark.read().schema(produceSchema()).json(appLogPath)
                .distinct().repartition(partitionNum);
        otaAppLog.createOrReplaceTempView("otaAppLog");
        beforePartition(spark);
        String sql = "insert overwrite table ota_app_log partition(pt='" + pt + "') "
                + "select mid,ip,version,deviceId,productId,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh,cityEn,cityZh,"
                + "networkType,lac,cid,mcc,mnc,rxlev,num,goType,createTime,dataType from otaAppLog";
        logger.warn("executing sql is : " + sql);
        spark.sql(sql);
    }

    public StructType produceSchema() {
        List<StructField> inputFields = new ArrayList<>();
        String splitSeq = ",";
        String stringType = "mid,ip,version,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh,"
                + "cityEn,cityZh,networkType,deviceId,lac,cid,mcc,mnc,rxlev,dataType";
        String timeType = "createTime";
        String longType = "productId";
        String integerType = "num,goType";
        for (String stringTmp : stringType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(stringTmp, DataTypes.StringType, true));
        }
        inputFields.add(DataTypes.createStructField(timeType, DataTypes.TimestampType, false));
        for (String integerTmp : integerType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(integerTmp, DataTypes.IntegerType, true));
        }
        for (String longTmp : longType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(longTmp, DataTypes.LongType, false));
        }
        return DataTypes.createStructType(inputFields);
    }

    public static void main(String[] args) throws Exception {
        String pt = DateUtil.producePtOrYesterday(args);
        OtaAppLog otaAppLog = new OtaAppLog();
        otaAppLog.runAll(pt);
    }
}
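FlumePath, DateUtil, and the AbstractSparkSql base class (shown next) are project helpers. Stripped of them, the demo is: build an explicit schema, read the JSON with it, de-duplicate, and insert overwrite into a Hive partition. A minimal self-contained sketch of that pattern, with hypothetical paths and table names:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.StructType;

    public class JsonToHiveSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .appName("JsonToHiveSketch")
                    .enableHiveSupport()
                    .getOrCreate();
            // explicit schema: malformed fields become nulls instead of failing the job
            StructType schema = new StructType()
                    .add("mid", DataTypes.StringType, true)
                    .add("num", DataTypes.IntegerType, true)
                    .add("createTime", DataTypes.TimestampType, false);
            Dataset<Row> log = spark.read().schema(schema)
                    .json("/flume/app_log/pt=2017-10-11")   // hypothetical HDFS path
                    .distinct();
            log.createOrReplaceTempView("appLog");
            // assumes demo_db.app_log already exists and is partitioned by pt
            spark.sql("insert overwrite table demo_db.app_log partition(pt='2017-10-11') "
                    + "select mid, num, createTime from appLog");
            spark.stop();
        }
    }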
package com.adups.base;

import com.adups.config.HiveConfig;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;

/**
 * @author Allen
 * Created by Allen on 04/08/2017.
 */
public abstract class AbstractSparkSql extends AbstractFileSystem {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * Spark operation
     * @param pt time format pt=2017-10-11
     * @param path HDFS path
     */
    public abstract void executeProgram(String pt, String path, SparkSession spark) throws IOException;

    public boolean existsPath(String... pathList) throws IOException {
        for (String path : pathList) {
            if (!fileSystem.exists(new Path(path))) {
                logger.error("the path : " + path + " is not existed");
                return false;
            } else {
                logger.warn("executing path is : " + path);
            }
        }
        return true;
    }

    public void runAll(String pt, String path, boolean isHiveSupport) throws IOException {
        if (path != null && !existsPath(path)) {
            logger.error("the src path is not existed: " + path);
            return;
        }
        executeSpark(pt, path, isHiveSupport);
    }

    /** no path check, enables hive support by default */
    public void runAll(String pt) throws IOException {
        runAll(pt, null, true);
    }

    public void runAll(String pt, String path) throws IOException {
        runAll(pt, path, true);
    }

    public void runAll(String pt, boolean isHiveSupport) throws IOException {
        runAll(pt, null, isHiveSupport);
    }

    private void executeSpark(String pt, String path, boolean isHiveSupport) throws IOException {
        SparkSession spark;
        String appName = this.getClass().getSimpleName();
        if (isHiveSupport) {
            spark = SparkSession.builder().appName(appName).enableHiveSupport().getOrCreate();
            logger.info("spark enable hive, begin to execute");
            useDatabase(spark);
        } else {
            spark = SparkSession.builder().appName(appName).getOrCreate();
            logger.info("spark begin to execute");
        }
        executeProgram(pt, path, spark);
        logger.info("spark has finished");
    }

    private void useDatabase(SparkSession spark) {
        logger.info("before the sql: " + HiveConfig.SQL_DATABASE);
        spark.sql(HiveConfig.SQL_DATABASE);
    }

    public void beforePartition(SparkSession spark) {
        spark.sql(HiveConfig.HIVE_PARTITION);
    }
}
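AbstractSparkSql is a template method: runAll() validates the input path, builds the SparkSession (with or without Hive support), and delegates to the subclass's executeProgram(). A hypothetical minimal subclass to illustrate the contract:

    import com.adups.base.AbstractSparkSql;
    import org.apache.spark.sql.SparkSession;
    import java.io.IOException;

    public class DailyCount extends AbstractSparkSql {
        @Override
        public void executeProgram(String pt, String path, SparkSession spark) throws IOException {
            // table name is made up; the base class has already selected the database
            spark.sql("select count(*) from app_log where pt='" + pt + "'").show();
        }

        public static void main(String[] args) throws IOException {
            new DailyCount().runAll("2017-10-11");   // hive support enabled by default
        }
    }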
Spark SQL demo: compute device statistics and write the results to MySQL
package com.adups.online.flume;

import com.adups.base.AbstractSparkSql;
import com.adups.bean.out.DeviceArea;
import com.adups.config.OnlineOfflinePath;
import com.adups.common.ReadTable;
import com.adups.common.sql.flume.DeviceAreaOnlineSave;
import com.adups.util.CommonUtil;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.collection.Seq;
import java.io.IOException;

import static org.apache.spark.sql.functions.*;

/**
 * @author Allen
 * Created by Allen on 03/08/2017.
 */
public class DeviceAreaOnline extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {
        String prePath = DateUtil.pathPtWithPre(pt);
        String nowPt = DateUtil.nowPtDay();
        String beginTime = nowPt + " 00:00:00";
        String endTime = nowPt + " 23:59:59";

        String deviceTotal = OnlineOfflinePath.OFFLINE_DEVICE_NEW_TOTAL_PATH + prePath;
        String deviceAreaTotal = OnlineOfflinePath.OFFLINE_DEVICE_AREA_NEW_TOTAL_PATH + prePath;
        String originAreaPath = OnlineOfflinePath.ONLINE_DEVICE_AREA_NEW_TOTAL_PATH;
        if (!existsPath(deviceTotal, deviceAreaTotal)) {
            return;
        }

        // today's registrations, pushed down to MySQL as a filtered subquery
        String where = "(select product_id as productId,device_id as deviceId,country_zh as country,province_zh as province "
                + "from iot_register.device_info where create_time between '" + beginTime + "' and '" + endTime + "') as device_time_filter";
        Dataset<Row> todayDevice = new ReadTable().loadTable(spark, where).coalesce(1);

        // totals per product: yesterday's totals joined with today's distinct-device increments
        Dataset<Row> yesterdayStats = spark.read().parquet(deviceTotal).select("productId", "totalNum");
        Dataset<Row> totalIncrement = todayDevice.groupBy("productId").agg(functions.countDistinct("deviceId").as("newNum"));
        Seq<String> seq = CommonUtil.columnNames("productId");
        Seq<String> naFillZero = CommonUtil.columnNames("newNum,totalNum");
        Dataset<Row> result = yesterdayStats.join(totalIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("newNum"), col("newNum").plus(col("totalNum")).as("totalNum"))
                .withColumn("pt", lit(nowPt)).coalesce(1);

        // the same statistics broken down by country/province
        Dataset<Row> yesterdayAreaStatistics = spark.read().parquet(deviceAreaTotal)
                .select("productId", "country", "province", "totalNum").toDF();
        Dataset<Row> areaIncrement = todayDevice.groupBy("productId", "country", "province")
                .agg(functions.countDistinct("deviceId").as("newNum"));
        seq = CommonUtil.columnNames("productId,country,province");
        Dataset<Row> areaResult = yesterdayAreaStatistics.join(areaIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("country"), col("province"), col("newNum"),
                        col("newNum").plus(col("totalNum")).as("totalNum"))
                .withColumn("pt", lit(nowPt)).coalesce(1);

        // only the rows that changed since the last run are sent to MySQL
        Dataset<DeviceArea> deltaArea;
        if (existsPath(originAreaPath)) {
            try {
                Dataset<Row> originBase = spark.read().parquet(originAreaPath);
                deltaArea = areaResult.except(originBase).coalesce(1).as(new DeviceArea().produceBeanEncoder());
            } catch (Exception e) {
                logger.error(e.getMessage());
                deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
            }
        } else {
            deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
        }
        try {
            insertToMysql(deltaArea);
        } catch (Exception e) {
            logger.error(e.getMessage());
        }
        areaResult.write().mode("overwrite").format("parquet").save(originAreaPath);
        result.write().mode("overwrite").format("parquet").save(OnlineOfflinePath.ONLINE_DEVICE_NEW_TOTAL_PATH);
    }

    public void insertToMysql(Dataset<DeviceArea> dataset) {
        dataset.foreachPartition(data -> {
            String sql = "insert into stats_device_area(product_id,country,province,new_num,total_num,pt) values(?,?,?,?,?,?) "
                    + "on duplicate key update new_num=?,total_num=?";
            new DeviceAreaOnlineSave().putDataBatch(data, sql);
        });
    }

    public static void main(String[] args) throws IOException {
        String pt = DateUtil.producePtOrYesterday(args);
        DeviceAreaOnline deviceAreaOnline = new DeviceAreaOnline();
        deviceAreaOnline.runAll(pt, false);
    }
}
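DeviceAreaOnlineSave.putDataBatch is not shown in the article. Judging by the SQL above, a plausible implementation is a plain JDBC batch that binds the six insert parameters plus the two trailing ON DUPLICATE KEY UPDATE parameters. The sketch below assumes that; the connection settings and the DeviceArea getter names are placeholders:

    import com.adups.bean.out.DeviceArea;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.util.Iterator;

    public class JdbcBatchUpsertSketch {
        // rows: one partition of the deltaArea dataset; sql: the upsert statement above
        public void putDataBatch(Iterator<DeviceArea> rows, String sql) throws Exception {
            try (Connection conn = DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/stats", "user", "password");   // placeholder settings
                 PreparedStatement ps = conn.prepareStatement(sql)) {
                while (rows.hasNext()) {
                    DeviceArea r = rows.next();
                    ps.setLong(1, r.getProductId());   // getter names are assumptions
                    ps.setString(2, r.getCountry());
                    ps.setString(3, r.getProvince());
                    ps.setLong(4, r.getNewNum());
                    ps.setLong(5, r.getTotalNum());
                    ps.setString(6, r.getPt());
                    ps.setLong(7, r.getNewNum());      // values for on duplicate key update
                    ps.setLong(8, r.getTotalNum());
                    ps.addBatch();
                }
                ps.executeBatch();
            }
        }
    }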
