Operating Environment
Cluster Environment: CDH5.3.0
The specific jar versions are as follows:
Spark version: 1.2.0-cdh5.3.0
Hive Version: 0.13.1-cdh5.3.0
Hadoop version: 2.5.0-cdh5.3.0
A simple Java example of Spark SQL
Spark SQL directly queries JSON-formatted data
Custom functions for Spark SQL
Spark SQL queries a table in Hive
import java.util.arraylist;import java.util.list;import org.apache.spark.sparkconf;import org.apache.spark.api.java.javardd;import org.apache.spark.api.java.javasparkcontext;import org.apache.spark.api.java.function.function;import org.apache.spark.sql.api.java.datatype;import Org.apache.spark.sql.api.java.javasqlcontext;import org.apache.spark.sql.api.java.javaschemardd;import org.apache.spark.sql.api.java.Row;import org.apache.spark.sql.api.java.UDF1;import org.apache.spark.sql.hive.api.java.javahivecontext;/** * note: * when using JavaHiveContext * 1: Need to add three configuration files under Classpath: hive-site.xml,core-site.xml,hdfs-site.xml * 2: Need to increase dependency on PostgreSQL or MySQL driver package * 3: need to increase hive-jdbc,hive-exec dependency * */public class Simpledemo { public static void main (String[] args) { sparkconf conf&nbSp;= new sparkconf (). Setappname ("Simpledemo"). Setmaster ("local"); javasparkcontext sc = new javasparkcontext (conf); javasqlcontext sqlctx = new javasqlcontext (SC); javahivecontext hivectx = new javahivecontext (SC);// testqueryjson (SQLCTX);// &NBSP;&NBSP;TESTUDF (SC,&NBSP;SQLCTX); testhive (HIVECTX); sc.stop (); sc.close () ; } //test Spark sql directly query JSON-formatted data Public static void testqueryjson (JAVASQLCONTEXT&NBSP;SQLCTX) { javaschemarDd rdd = sqlctx.jsonfile ("File:///D:/tmp/tmp/json.txt"); rdd.printschema (); // register the Input schema rdd rdd.registertemptable ("account"); javaschemardd accs = sqlctx.sql ("SELECT Address, email,id,name from account order by id limit 10 "); list<row> result = accs.collect (); for (Row row : result) { system.out.println (row.getstring (0) + "," + row.getstring (1) + "," + row.getint (2) + "," + row.getstring (3)); } javardd<string> names = accs.map (new function<row, string> () { @Override public string call (Row row) throws Exception { return row.getstring (3); } }); system.out.println (Names.collect ()); } //Test Spark sql Custom Function public 
static &NBSP;VOID&NBSP;TESTUDF (JavasparkcontEXT&NBSP;SC,&NBSP;JAVASQLCONTEXT&NBSP;SQLCTX) { // create a account and turn it into a schema rdd ArrayList<AccountBean> accList = new ArrayList< Accountbean> (); acclist.add (New accountbean (1, " Lily ", " [email protected] ", " Gz tianhe "); Javardd<accountbean> accrdd = sc.parallelize (acclist); javaschemardd rdd = sqlctx.applyschema (Accrdd, accountbean.class); rdd.registertemptable ("ACC"); // Writing Custom Function Udf sqlctx.registerfunction (" Strlength ", new udf1<String, integer> () { @ Override public integer call ( STRING&NBSP;STR) throws Exception { return str.length (); } }, datatype.integertype); // data Query List< Row> result = sqlctx.sql ("Select strlength (' name '), NAME,ADDRESS&NBSP;FROM&NBSP;ACC limit 10 "). Collect (); for (Row row : result) { system.out.println (Row.getint (0) + "," + row.getstring (1) + "," + row.getstring (2)); } } //Test spark sql Query the table above hive public static void testhive (JAVAHIVECONTEXT&NBSP;HIVECTX) { list<row> result = hivectx.sql ("SELECT foo,bar, Name from pokes2 limit 10 "). Collect (); for (Row row : result) { system.out.println (row.getstring (0) + "," + row.getstring (1) + "," + row.getstring (2)); } }}
Spark SQL Simple Example