# Load the SparkR package
library(SparkR)

# Initialize the Spark context
sc <- sparkR.init(master = "spark://host:7077",
                  appName = "sparkr_logistic_regression",
                  sparkEnvir = list(spark.executor.memory = "1g",
                                    spark.cores.max = "10"))

# Read the text file from HDFS as an RDD with 4 partitions on the Spark cluster
input_rdd <- textFile(sc,
                      "hdfs://cluster_ip:port/user/payton/german.data-numeric.txt",
                      minSplits = 4)

# Parse each line of text into a numeric vector (runs in parallel on each partition)
dataset_rdd <- lapplyPartition(input_rdd, function(part) {
  part <- lapply(part, function(x) unlist(strsplit(x, "\\s")))
  part <- lapply(part, function(x) as.numeric(x[x != ""]))
  part
})

# Split dataset_rdd into a training set (train) and a test set (test), where
# ptest is the fraction of samples assigned to the test set; e.g. ptest = 0.2
# puts 20% of the samples of dataset_rdd in the test set and the remaining
# 80% in the training set
split_dataset <- function(rdd, ptest) {
  # Create the test set RDD from a ptest fraction of the input samples
  data_test_rdd <- lapplyPartition(rdd, function(part) {
    part_test <- part[1:(length(part) * ptest)]
    part_test
  })
  # Create the training set RDD from the remaining samples
  data_train_rdd <- lapplyPartition(rdd, function(part) {
    part_train <- part[((length(part) * ptest) + 1):length(part)]
    part_train
  })
  # Return the test set RDD and training set RDD as a list
  list(data_test_rdd, data_train_rdd)
}

# Convert the dataset into R matrices, prepend a column of 1s as the
# intercept term, and normalize the label y to 0/1 (the raw labels are 1/2)
get_matrix_rdd <- function(rdd) {
  matrix_rdd <- lapplyPartition(rdd, function(part) {
    m <- matrix(data = unlist(part, FALSE, FALSE), ncol = 25, byrow = TRUE)
    m <- cbind(1, m)
    m[, ncol(m)] <- m[, ncol(m)] - 1
    m
  })
  matrix_rdd
}

# The training set contains y = 0 and y = 1 samples in a ratio of about 7:3,
# so downsample the y = 0 class to match the y = 1 sample count
balance_matrix_rdd <- function(matrix_rdd) {
  balanced_matrix_rdd <- lapplyPartition(matrix_rdd, function(part) {
    y <- part[, 26]
    index <- sample(which(y == 0), length(which(y == 1)))
    index <- c(index, which(y == 1))
    part <- part[index, ]
    part
  })
  balanced_matrix_rdd
}

# Split the dataset into training and test sets
dataset <- split_dataset(dataset_rdd, 0.2)

# Create the test set RDD
matrix_test_rdd <- get_matrix_rdd(dataset[[1]])

# Create the training set RDD (balanced)
matrix_train_rdd <- balance_matrix_rdd(get_matrix_rdd(dataset[[2]]))

# Cache the training and test set RDDs in the cluster's distributed memory
cache(matrix_test_rdd)
cache(matrix_train_rdd)

# Initialize the parameter vector theta (intercept plus 24 features)
theta <- runif(n = 25, min = -1, max = 1)

# Logistic (sigmoid) function
hypot <- function(z) {
  1 / (1 + exp(-z))
}

# Gradient of the loss function: (1/m) * X' %*% (h(X %*% theta) - y)
gcost <- function(theta, x, y) {
  (1 / nrow(x)) * (t(x) %*% (hypot(x %*% theta) - y))
}
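# (Illustrative sanity check, not part of the original script: the synthetic
# matrix, labels, and the *_check names below are assumptions made up purely
# to confirm in plain R, before the distributed loop runs, that hypot() stays
# in (0, 1) and gcost() returns one gradient entry per parameter.)
x_check <- cbind(1, matrix(rnorm(10 * 24), ncol = 24))  # 10 rows: intercept + 24 features
y_check <- sample(0:1, 10, replace = TRUE)              # synthetic 0/1 labels
theta_check <- runif(25, min = -1, max = 1)
stopifnot(all(hypot(x_check %*% theta_check) > 0),
          all(hypot(x_check %*% theta_check) < 1),
          nrow(gcost(theta_check, x_check, y_check)) == 25)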
# Training function: one pass over the training RDD computing the gradient
train <- function(theta, rdd) {
  # Compute the partial gradient on each partition
  gradient_rdd <- lapplyPartition(rdd, function(part) {
    x <- part[, 1:25]
    y <- part[, 26]
    p_gradient <- gcost(theta, x, y)
    list(list(1, p_gradient))
  })
  # Sum the partial gradients across partitions (all share the key 1)
  agg_gradient_rdd <- reduceByKey(gradient_rdd, "+", 1L)
  # Collect the aggregated gradient for this iteration
  collect(agg_gradient_rdd)[[1]][[2]]
}

# Optimize the loss function by gradient descent
# alpha: learning rate
# step:  iteration counter
# tol:   convergence tolerance
alpha <- 0.1
tol <- 1e-4
step <- 1
while (TRUE) {
  cat("step:", step, "\n")
  p_gradient <- train(theta, matrix_train_rdd)
  theta <- theta - alpha * p_gradient
  gradient <- train(theta, matrix_train_rdd)
  # Stop once the change in the gradient's Frobenius norm falls below tol
  if (abs(norm(gradient, type = "F") - norm(p_gradient, type = "F")) <= tol) break
  step <- step + 1
}

# Use the trained model to predict the credit rating ("good" or "bad") for
# the test set and compute the prediction accuracy
test <- lapplyPartition(matrix_test_rdd, function(part) {
  x <- part[, 1:25]
  y <- part[, 26]
  y_pred <- hypot(x %*% theta)
  # TRUE where the rounded prediction disagrees with the label
  result <- xor(as.vector(round(y_pred)), as.vector(y))
})
result <- unlist(collect(test))
corrects <- length(result[result == FALSE])
wrongs <- length(result[result == TRUE])
cat("\ncorrects:", corrects, "\n")
cat("wrongs:", wrongs, "\n")
cat("accuracy:", corrects / length(result), "\n")
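# (Optional follow-up, not in the original: base R's table() on the collected
# error flags produces the same corrects/wrongs counts as the manual
# bookkeeping above in a single call.)
print(table(prediction_errors = result))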