// RDD basics walkthrough for spark-shell.
// `sc` is assumed to be the SparkContext that spark-shell predefines.

// Load the integers 1 to 10 into an RDD
val num = sc.parallelize(1 to 10)
// Multiply each element by 2; `_ * 2` is an anonymous function passed to map
val doublenum = num.map(_ * 2)
// Mark the doubled RDD to be cached in memory
doublenum.cache()
// Keep only the elements divisible by 3 (`==` compares; a single `=` does not compile here)
val threenum = doublenum.filter(_ % 3 == 0)
// Release the cache
// NOTE(review): the RDD cached above is doublenum, not threenum — confirm which one was meant
threenum.unpersist()
// Trigger an action: Spark builds and executes the DAG from the previous steps
// and returns the result set to the driver as an array
threenum.collect
// The first element of the result set
threenum.first
// The first three elements of the result set
threenum.take(3)
// The number of elements in the dataset
threenum.count
// Show the RDD lineage (the chain of transformations) produced by the preceding steps
threenum.toDebugString
Result:
// Key-value RDD operations. `sc` is assumed to be the spark-shell SparkContext.

// Load a list of (key, value) pairs into an RDD
val kv1 = sc.parallelize(List(("A", 1), ("B", 2), ("C", 3), ("A", 4), ("B", 5)))
// Sort the pairs by key
kv1.sortByKey().collect
// Group the values that share the same key
kv1.groupByKey().collect
// Combine the values that share the same key by adding them together
kv1.reduceByKey(_ + _).collect
Note: the differences between sortByKey, groupByKey, and reduceByKey are as follows:
// A second key-value RDD that contains duplicate pairs
val kv2 = sc.parallelize(List(("A", 4), ("A", 4), ("C", 3), ("A", 4), ("B", 5)))
// Remove duplicate elements
kv2.distinct.collect
// Concatenate kv1 and kv2 into one dataset (duplicates are kept; this is not a join)
kv1.union(kv2).collect
// Join kv1 and kv2 on their keys, equivalent to joining two tables
kv1.join(kv2).collect
// An RDD whose elements are lists of integers
val kv3 = sc.parallelize(List(List(1, 2), List(3, 4)))
// Add 1 to every number and flatten the nested lists;
// note that the returned dataset is no longer of the K-V type
kv3.flatMap(x => x.map(_ + 1)).collect
- HDFS file operation demonstration
First upload the clk.tsv and reg.tsv files to HDFS; their format is as follows:
// HDFS demo: load two tab-separated files, key both by uuid, and join them.
// `sc` is assumed to be the SparkContext that spark-shell predefines.

// Date parser for the first column of both files.
// "MM" is month-of-year; lowercase "mm" would be read as minutes.
val format = new java.text.SimpleDateFormat("yyyy-MM-dd")
// One row of reg.tsv: date, uuid, customer id, latitude, longitude
case class Register(d: java.util.Date, uuid: String, cust_id: String, lat: Float, lng: Float)
// One row of clk.tsv: date, uuid, landing page id
case class Click(d: java.util.Date, uuid: String, landing_page: Int)
// Load reg.tsv from HDFS, split each line on tabs, and build (uuid, Register) pairs
val reg = sc.textFile("hdfs://chenx:9000/week2/join/reg.tsv")
  .map(_.split("\t"))
  .map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
// Load clk.tsv from HDFS, split each line on tabs, and build (uuid, Click) pairs;
// the last column is trimmed before it is converted to Int
val clk = sc.textFile("hdfs://chenx:9000/week2/join/clk.tsv")
  .map(_.split("\t"))
  .map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
// Join registrations with clicks on uuid and return the matched pairs to the driver
reg.join(clk).collect
Spark Series (II): Spark Shell Operations and Detailed Descriptions