// Load data 1 ~ 10 as an RDD
val num = sc.parallelize(1 to 10)

// Multiply each data item by 2. Note that _ * 2 is recorded as a function (fun)
val doublenum = num.map(_ * 2)

// Cache the data in memory
doublenum.cache()

// Filter the data: an item belongs to the result set if item % 3 is 0
val threenum = doublenum.filter(_ % 3 == 0)

// Release the cache
threenum.unpersist()

// Action: builds and executes a DAG from the previous transformations,
// returning the result set as an array
threenum.collect

// The first element of the result set
threenum.first

// The first three elements of the result set
threenum.take(3)

// Count the number of elements in the dataset
threenum.count

// View the RDD lineage (chain of transformations) built by the preceding steps
threenum.toDebugString
 
 
 
 Result:
 
 
 
  
 
 
 
 
 // Load data
 
 Val kv1 = SC. parallelize (List ("A", 1), ("B", 2), ("C", 3), ("A", 4 ), ("B", 5 )))
 
 // Sort the data based on the K value of each element in the dataset.
 
 Kv1.sortbykey (). Collect
 
 
 
  
 
 
 Kv1.groupbykey (). Collect // grouping data based on the K value of each element in the dataset
 
  
 Kv1.performancebykey (_ + _). Collect
 
  
 Note: the differences between sortbykey, groupbykey, and performancebykey are as follows;
 
 Val kv2 = SC. parallelize (List ("A", 4), ("A", 4), ("C", 3), ("A", 4 ), ("B", 5 )))
 
 Kv2.distinct. Collect // deduplicate distinct
 
  
 Kv1.union (kv2). Collect // kv1 is associated with kv2
 
  
 Kv1.join (kv2). Collect // connection between kv1 and kv2 is equivalent to table Association.
 
  
 Val kv3 = SC. parallelize (List (1, 2), list (3, 4 )))
 
 Kv3.flatmap (x => X. Map (_ + 1). Collect // note that the returned dataset is no longer of the K-V type
 
  
  
 
 
 
- HDFS file operation demonstration

Upload the clk.tsv and reg.tsv files to HDFS; their format is shown below:
 
 
 
 
 
  
 
 
 // Define a constant for date formatting
 
 Val format = new java. Text. simpledateformat ("yyyy-mm-dd ")
 
 // Scala syntax, defines the register class (according to Reg. TSV Data Format)
 
 Case class register (D: Java. util. Date, UUID: String, cust_id: String, Lat: float, LNG: Float)
 
 // Scala syntax, defines the click class (according to the CLK. TSV Data Format)
 
 Case class click (D: Java. util. Date, UUID: String, landing_page: INT)
 
 // Load the file Reg. TSV on HDFS and convert each row of data to a register object;
 
 Val Reg = SC. textfile ("HDFS: // chenx: 9000/week2/join/Reg. TSV "). map (_. split ("\ t ")). map (r => (r (1), register (format. parse (R (0), R (1), R (2), R (3 ). tofloat, R (4 ). tofloat )))
 
 // Load the CLK. TSV file on HDFS and convert each row of data to a click object;
 
 Val CLK = SC. textfile ("HDFS: // chenx: 9000/week2/join/CLK. TSV "). map (_. split ("\ t ")). map (C => (C (1), click (format. parse (C (0), C (1), C (2 ). trim. toint )))
 
 Reg. Join (CLK). Collect
 
  
 
Spark series (2): Spark shell operations explained in detail