Spark 2: Loading and saving files, and converting a data file into a DataFrame

First upload the data file to HDFS and verify that it arrived:

hadoop fs -put /home/wangxiao/data/ml/affairs.csv /datafile/wangxiao/

hadoop fs -ls -R /datafile
drwxr-xr-x   - wangxiao supergroup      0 2016-10-15 10:46 /datafile/wangxiao
-rw-r--r--   3 wangxiao supergroup  16755 2016-10-15 10:46 /datafile/wangxiao/affairs.csv
-rw-r--r--   3 wangxiao supergroup  16755 2016-10-13 21:48 /datafile/wangxiao/affairs.txt

The columns of the affairs dataset:

// affairs: how often the respondent traveled alone over the past year
// gender: gender
// age: age
// yearsmarried: number of years married
// children: whether there are children
// religiousness: degree of religious belief (5-point scale, 1 = opposed, 5 = very religious)
// education: education level
// occupation: occupation (reverse numbering of Gordon's 7-category classification)
// rating: self-rating of the marriage (5-point scale, 1 = very unhappy, 5 = very happy)

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.Encoder

object ML1 {

  // The case classes are declared at object scope: toDF() needs an implicit
  // Encoder, and Scala cannot materialize the required TypeTag for a case
  // class declared inside main ("No TypeTag available" compile error).
  case class Affairs(affairs: Int, gender: String, age: Int,
                     yearsmarried: Double, children: String, religiousness: Int,
                     education: Double, occupation: Double, rating: Int)

  case class Affairs1(affairs: Int, gender: String, age: Int,
                      yearsmarried: Double, children: String, religiousness: Int,
                      education: Double, occupation: Double, rating: Int)

  def main(args: Array[String]) {
    val spark = SparkSession.builder()
      .appName("Spark SQL basic example")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    // For implicit conversions like converting RDDs to DataFrames
    import spark.implicits._

    // Create a DataFrame
    // val data1: DataFrame = spark.read.csv("hdfs://ns1/datafile/wangxiao/affairs.csv")
    val data1: DataFrame = spark.read.format("csv")
      .load("hdfs://ns1/datafile/wangxiao/affairs.csv")

    val df = data1.toDF("affairs", "gender", "age", "yearsmarried", "children",
      "religiousness", "education", "occupation", "rating")

    df.printSchema()

    // ##############################################
    // Specify field names and field types via a case class
    val res1 = data1.rdd.map { r =>
      Affairs(r(0).toString.toInt, r(1).toString, r(2).toString.toInt,
        r(3).toString.toDouble, r(4).toString, r(5).toString.toInt,
        r(6).toString.toDouble, r(7).toString.toDouble, r(8).toString.toInt)
    }.toDF()

    res1.printSchema()

    // ##############################################
    // Create an RDD from a plain text file
    val data2: RDD[String] = spark.sparkContext
      .textFile("hdfs://ns1/datafile/wangxiao/affairs.txt")

    // Convert the RDD to a DataFrame
    val res2 = data2.map(_.split(" ")).map { line =>
      Affairs1(line(0).toInt, line(1).trim, line(2).toInt,
        line(3).toDouble, line(4).trim, line(5).toInt,
        line(6).toDouble, line(7).toDouble, line(8).toInt)
    }.toDF()

    // ##############################################
    // Register a temporary view
    df.createOrReplaceTempView("affairs")

    // Subquery
    // val df1 = spark.sql("SELECT * FROM affairs WHERE age BETWEEN 20 AND 25")
    val df1 = spark.sql(
      "SELECT gender, age, rating FROM (SELECT * FROM affairs WHERE age BETWEEN 20 AND 25) t")

    df1.show()

    // Save the DataFrame to a file
    df.select("gender", "age", "education").write.format("csv")
      .save("hdfs://ns1/datafile/wangxiao/data123.csv")
  }
}

Listing HDFS again shows the new output; note that the CSV writer creates data123.csv as a directory of part files, not a single file:

hadoop fs -ls -R /datafile
drwxr-xr-x   - wangxiao supergroup      0 2016-10-15 11:43 /datafile/wangxiao
-rw-r--r--   3 wangxiao supergroup  16755 2016-10-15 10:46 /datafile/wangxiao/affairs.csv
-rw-r--r--   3 wangxiao supergroup  16755 2016-10-13 21:48 /datafile/wangxiao/affairs.txt
drwxr-xr-x   - wangxiao supergroup      0 2016-10-15 11:43 /datafile/wangxiao/data123.csv
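A note on the load step above: spark.read.format("csv").load(...) leaves every column typed as string, which is why the program re-types the rows through the Affairs case class. A minimal alternative sketch, assuming the same HDFS path and the column names from the toDF call above, pushes the types into the reader with an explicit StructType so the rdd.map pass becomes unnecessary:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}

object SchemaLoad {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CSV with explicit schema").getOrCreate()

    // Declare names and types once, instead of renaming with toDF and
    // re-typing every row through a case class.
    val schema = StructType(Seq(
      StructField("affairs", IntegerType),
      StructField("gender", StringType),
      StructField("age", IntegerType),
      StructField("yearsmarried", DoubleType),
      StructField("children", StringType),
      StructField("religiousness", IntegerType),
      StructField("education", DoubleType),
      StructField("occupation", DoubleType),
      StructField("rating", IntegerType)))

    // The reader parses each column straight into the declared type.
    val df = spark.read.schema(schema).format("csv")
      .load("hdfs://ns1/datafile/wangxiao/affairs.csv")

    df.printSchema()
    spark.stop()
  }
}

Either route ends at the same schema; the explicit StructType simply moves the type declarations from row-conversion code into the reader itself.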
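Relatedly, the write...save(...) call in ML1 throws an AnalysisException if /datafile/wangxiao/data123.csv already exists, for example on a second run. Below is a small sketch of the same save with an explicit save mode and a header row; SaveMode.Overwrite and the header option are assumptions here, not choices made in the original program:

import org.apache.spark.sql.{SaveMode, SparkSession}

object SaveWithMode {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CSV save modes").getOrCreate()

    val df = spark.read.format("csv")
      .load("hdfs://ns1/datafile/wangxiao/affairs.csv")
      .toDF("affairs", "gender", "age", "yearsmarried", "children",
        "religiousness", "education", "occupation", "rating")

    // Overwrite replaces an existing output directory instead of failing;
    // the header option writes the column names as the first row of each part file.
    df.select("gender", "age", "education")
      .write
      .mode(SaveMode.Overwrite)
      .option("header", "true")
      .format("csv")
      .save("hdfs://ns1/datafile/wangxiao/data123.csv")

    spark.stop()
  }
}

As the final listing shows, the CSV writer always produces a directory of part files; calling coalesce(1) on the DataFrame before write is a common (if single-threaded) way to end up with a single part file inside that directory.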