Java Data Statistics
Spark version 2.1.2, covering Dataset usage and Spark Streaming data statistics.
Project address: https://github.com/baifanwudi/big-data-analysis

Code example: a Spark SQL demo that reads a JSON file and writes it into a Hive table.
package com.adups.offline.hive.log;

import com.adups.base.AbstractSparkSql;
import com.adups.config.FlumePath;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class OtaAppLog extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(OtaAppLog.class);

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {
        int partitionNum = 4;
        String ptWithPre = DateUtil.pathPtWithPre(pt);
        String appLogPath = FlumePath.APP_LOG_PATH + ptWithPre;
        if (!existsPath(appLogPath)) {
            return;
        }
        Dataset<Row> otaAppLog = spark.read().schema(produceSchema()).json(appLogPath).distinct().repartition(partitionNum);
        otaAppLog.createOrReplaceTempView("otaAppLog");
        beforePartition(spark);
        String sql = "insert overwrite table ota_app_log partition(pt='" + pt + "') "
                + "select mid,ip,version,deviceId,productId,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh,cityEn,cityZh,"
                + "networkType,lac,cid,mcc,mnc,rxlev,num,goType,createTime,dataType from otaAppLog";
        logger.warn("executing sql is: " + sql);
        spark.sql(sql);
    }

    public StructType produceSchema() {
        List<StructField> inputFields = new ArrayList<>();
        String splitSeq = ",";
        String stringType = "mid,ip,version,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh,"
                + "cityEn,cityZh,networkType,deviceId,lac,cid,mcc,mnc,rxlev,dataType";
        String timeType = "createTime";
        String longType = "productId";
        String integerType = "num,goType";
        for (String stringTmp : stringType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(stringTmp, DataTypes.StringType, true));
        }
        inputFields.add(DataTypes.createStructField(timeType, DataTypes.TimestampType, false));
        for (String integerTmp : integerType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(integerTmp, DataTypes.IntegerType, true));
        }
        for (String longTmp : longType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(longTmp, DataTypes.LongType, false));
        }
        return DataTypes.createStructType(inputFields);
    }

    public static void main(String[] args) throws Exception {
        String pt = DateUtil.producePtOrYesterday(args);
        OtaAppLog otaAppLog = new OtaAppLog();
        otaAppLog.runAll(pt);
    }
}
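The demo leans on two project helpers, DateUtil and FlumePath, that are defined elsewhere in the repository. As a rough guide to what the calls above expect, here is a minimal sketch assuming pathPtWithPre() builds a partition-style path suffix and producePtOrYesterday() falls back to yesterday's date; the actual helpers in the repo may behave differently.

// Illustrative sketch only; the real com.adups.util.DateUtil may differ.
import java.time.LocalDate;

public class DateUtilSketch {

    /** Assumed behavior: "2017-10-11" -> "/pt=2017-10-11", matching a partitioned Flume output directory layout. */
    public static String pathPtWithPre(String pt) {
        return "/pt=" + pt;
    }

    /** Assumed behavior: use args[0] as the partition date when supplied, otherwise default to yesterday. */
    public static String producePtOrYesterday(String[] args) {
        if (args != null && args.length > 0) {
            return args[0];
        }
        return LocalDate.now().minusDays(1).toString();
    }
}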
package com.adups.base;

import com.adups.config.HiveConfig;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;

/**
 * @author Allen
 * Created by Allen on 04/08/2017.
 */
public abstract class AbstractSparkSql extends AbstractFileSystem {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * Spark operation.
     * @param pt   time partition in the format pt=2017-10-11
     * @param path HDFS path
     */
    public abstract void executeProgram(String pt, String path, SparkSession spark) throws IOException;

    public boolean existsPath(String... pathList) throws IOException {
        for (String path : pathList) {
            if (!fileSystem.exists(new Path(path))) {
                logger.error("the path: " + path + " does not exist");
                return false;
            } else {
                logger.warn("executing path is: " + path);
            }
        }
        return true;
    }

    public void runAll(String pt, String path, boolean isHiveSupport) throws IOException {
        if (path != null && !existsPath(path)) {
            logger.error("the src path does not exist: " + path);
            return;
        }
        executeSpark(pt, path, isHiveSupport);
    }

    /** No path check; Hive support is enabled by default. */
    public void runAll(String pt) throws IOException { runAll(pt, null, true); }

    public void runAll(String pt, String path) throws IOException { runAll(pt, path, true); }

    public void runAll(String pt, boolean isHiveSupport) throws IOException { runAll(pt, null, isHiveSupport); }

    private void executeSpark(String pt, String path, boolean isHiveSupport) throws IOException {
        SparkSession spark;
        String appName = this.getClass().getSimpleName();
        if (isHiveSupport) {
            spark = SparkSession.builder().appName(appName).enableHiveSupport().getOrCreate();
            logger.info("spark enable hive, begin to execute");
            useDatabase(spark);
        } else {
            spark = SparkSession.builder().appName(appName).getOrCreate();
            logger.info("spark begin to execute");
        }
        executeProgram(pt, path, spark);
        logger.info("spark has finished");
    }

    private void useDatabase(SparkSession spark) {
        logger.info("before the sql: " + HiveConfig.SQL_DATABASE);
        spark.sql(HiveConfig.SQL_DATABASE);
    }

    public void beforePartition(SparkSession spark) {
        spark.sql(HiveConfig.HIVE_PARTITION);
    }
}
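AbstractSparkSql extends AbstractFileSystem, which is not listed in this post but supplies the fileSystem handle used by existsPath(). A minimal sketch, assuming the base class does nothing more than initialize a Hadoop FileSystem from the default configuration (the repository's version may differ):

// Minimal sketch of the assumed base class; not the repository's actual AbstractFileSystem.
package com.adups.base;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import java.io.IOException;

public abstract class AbstractFileSystem {

    /** Shared HDFS handle that subclasses such as AbstractSparkSql use for path checks. */
    protected FileSystem fileSystem;

    public AbstractFileSystem() {
        try {
            fileSystem = FileSystem.get(new Configuration());
        } catch (IOException e) {
            throw new RuntimeException("failed to initialize the HDFS FileSystem", e);
        }
    }
}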
Code example: Spark reads device data (Parquet plus a JDBC source), computes daily statistics, and writes the results to MySQL.
package com.adups.online.flume;

import com.adups.base.AbstractSparkSql;
import com.adups.bean.out.DeviceArea;
import com.adups.config.OnlineOfflinePath;
import com.adups.common.ReadTable;
import com.adups.common.sql.flume.DeviceAreaOnlineSave;
import com.adups.util.CommonUtil;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.collection.Seq;
import static org.apache.spark.sql.functions.*;
import java.io.IOException;

/**
 * @author Allen
 * Created by Allen on 03/08/2017.
 */
public class DeviceAreaOnline extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {

        String prePath = DateUtil.pathPtWithPre(pt);
        String nowPt = DateUtil.nowPtDay();
        String beginTime = nowPt + " 00:00:00";
        String endTime = nowPt + " 23:59:59";

        String deviceTotal = OnlineOfflinePath.OFFLINE_DEVICE_NEW_TOTAL_PATH + prePath;
        String deviceAreaTotal = OnlineOfflinePath.OFFLINE_DEVICE_AREA_NEW_TOTAL_PATH + prePath;
        String originAreaPath = OnlineOfflinePath.ONLINE_DEVICE_AREA_NEW_TOTAL_PATH;

        if (!existsPath(deviceTotal, deviceAreaTotal)) {
            return;
        }

        String where = "(select product_id as productId,device_id as deviceId,country_zh as country,province_zh as province from iot_register.device_info "
                + "where create_time between '" + beginTime + "' and '" + endTime + "') as device_time_filter";

        Dataset<Row> todayDevice = new ReadTable().loadTable(spark, where).coalesce(1);

        Dataset<Row> yesterdayStats = spark.read().parquet(deviceTotal).select("productId", "totalNum");
        Dataset<Row> totalIncrement = todayDevice.groupBy("productId").agg(functions.countDistinct("deviceId").as("newNum"));

        Seq<String> seq = CommonUtil.columnNames("productId");
        Seq<String> naFillZero = CommonUtil.columnNames("newNum,totalNum");

        Dataset<Row> result = yesterdayStats.join(totalIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("newNum"), col("newNum").plus(col("totalNum")).as("totalNum"))
                .withColumn("pt", lit(nowPt)).coalesce(1);

        Dataset<Row> yesterdayAreaStatistics = spark.read().parquet(deviceAreaTotal)
                .select("productId", "country", "province", "totalNum").toDF();
        Dataset<Row> areaIncrement = todayDevice.groupBy("productId", "country", "province")
                .agg(functions.countDistinct("deviceId").as("newNum"));

        seq = CommonUtil.columnNames("productId,country,province");
        Dataset<Row> areaResult = yesterdayAreaStatistics.join(areaIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("country"), col("province"), col("newNum"),
                        col("newNum").plus(col("totalNum")).as("totalNum"))
                .withColumn("pt", lit(nowPt)).coalesce(1);

        Dataset<DeviceArea> deltaArea;
        if (existsPath(originAreaPath)) {
            try {
                Dataset<Row> originBase = spark.read().parquet(originAreaPath);
                deltaArea = areaResult.except(originBase).coalesce(1).as(new DeviceArea().produceBeanEncoder());
            } catch (Exception e) {
                logger.error(e.getMessage());
                deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
            }
        } else {
            deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
        }

        try {
            insertToMysql(deltaArea);
        } catch (Exception e) {
            logger.error(e.getMessage());
        }

        areaResult.write().mode("overwrite").format("parquet").save(originAreaPath);
        result.write().mode("overwrite").format("parquet").save(OnlineOfflinePath.ONLINE_DEVICE_NEW_TOTAL_PATH);
    }

    public void insertToMysql(Dataset<DeviceArea> dataset) {
        dataset.foreachPartition(data -> {
            String sql = "insert into stats_device_area(product_id,country,province,new_num,total_num,pt) values(?,?,?,?,?,?) "
                    + "on duplicate key update new_num=?,total_num=?";
            new DeviceAreaOnlineSave().putDataBatch(data, sql);
        });
    }

    public static void main(String[] args) throws IOException {
        String pt = DateUtil.producePtOrYesterday(args);
        DeviceAreaOnline deviceAreaOnline = new DeviceAreaOnline();
        deviceAreaOnline.runAll(pt, false);
    }
}
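insertToMysql() delegates to DeviceAreaOnlineSave.putDataBatch(), whose implementation is not shown in this post. Below is a minimal sketch of the batched JDBC upsert it is assumed to perform; the connection URL, credentials, and the DeviceArea getter names are placeholders for illustration, not the repository's actual code.

// Hedged sketch of a batched MySQL upsert; not the repository's DeviceAreaOnlineSave.
import com.adups.bean.out.DeviceArea;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.Iterator;

public class DeviceAreaBatchWriterSketch {

    /** Writes one partition of DeviceArea beans with a single prepared-statement batch. */
    public void putDataBatch(Iterator<DeviceArea> data, String sql) throws Exception {
        // Placeholder connection settings; the project presumably loads these from its own config.
        try (Connection conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/stats", "user", "password");
             PreparedStatement ps = conn.prepareStatement(sql)) {
            while (data.hasNext()) {
                DeviceArea row = data.next();
                ps.setLong(1, row.getProductId());   // getter names are assumed
                ps.setString(2, row.getCountry());
                ps.setString(3, row.getProvince());
                ps.setLong(4, row.getNewNum());
                ps.setLong(5, row.getTotalNum());
                ps.setString(6, row.getPt());
                ps.setLong(7, row.getNewNum());      // values for the ON DUPLICATE KEY UPDATE clause
                ps.setLong(8, row.getTotalNum());
                ps.addBatch();
            }
            ps.executeBatch();
        }
    }
}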