This article implements the random forest algorithm in a PySpark environment:
# %pyspark
# ^ Zeppelin interpreter directive from the original notebook paragraph. It is
#   kept as a comment so this block is valid Python; restore it as the first
#   line when pasting the code into a Zeppelin paragraph.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row


def _to_labeled_df(rdd):
    """Convert an RDD of row-lists into a DataFrame with label/features columns.

    The last element of every row is used as the label; all preceding
    elements are packed into a dense feature vector.
    """
    return rdd.map(
        lambda x: Row(label=x[-1], features=Vectors.dense(x[:-1]))
    ).toDF()


def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN if denominator is 0.

    float() forces true division on Python 2 as well; the NaN guard keeps the
    report from crashing when a class is absent from the test split.
    """
    if not denominator:
        return float("nan")
    return float(numerator) / denominator


# Task goal: solve a two-class classification problem with a random forest
# and evaluate the classification quality.

# 1. Read data.
# NOTE(review): `spark` is not created in this block -- presumably the
# SparkSession injected by the notebook environment; confirm before running
# as a standalone script.
data = spark.sql("""select * from DataTable""")

# 2. Construct the training data set (last column is the label).
dataset = data.na.fill('0').rdd.map(list)
(train_data, test_data) = dataset.randomSplit([0.7, 0.3])
training_set = _to_labeled_df(train_data)
train_num = training_set.count()
print("Training sample: {}".format(train_num))
# show() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
training_set.show()

# 3. Train a random forest.
string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = string_indexer.fit(training_set)
train_tf = si_model.transform(training_set)
train_tf.show()
# The seed value was lost in the original text; any fixed integer makes the
# training run reproducible.
rf = RandomForestClassifier(
    numTrees=10, maxDepth=8, labelCol="indexed", seed=42
)
rfc_model = rf.fit(train_tf)
# Feature importances of the fitted model, then its number of features.
print("model feature: {}".format(rfc_model.featureImportances))
print("model feature: {}".format(rfc_model.numFeatures))

# 4. Test.
test_set = _to_labeled_df(test_data)
print("test sample number: {}".format(test_set.count()))
test_set.show()
# Reuse the indexer fitted on the training set. Re-fitting on the test set
# can assign different indices to the same labels (by default StringIndexer
# orders labels by frequency), which would silently corrupt every metric.
test_tf = si_model.transform(test_set)
result = rfc_model.transform(test_tf)
result.show()

# 5. Evaluate the classification quality.
# Build the (indexed label, prediction) -> row count table in a single Spark
# job instead of one filter().count() job per metric.
confusion = {
    (row["indexed"], row["prediction"]): row["count"]
    for row in result.groupBy("indexed", "prediction").count().collect()
}

total_amount = sum(confusion.values())
correct_amount = sum(
    n for (label, pred), n in confusion.items() if label == pred
)
precision_rate = _ratio(correct_amount, total_amount)
print(" Prediction accuracy is: {} ".format(precision_rate))

# "Positive" means indexed label 0 and "negative" means indexed label 1.
# NOTE(review): with the default frequency ordering, index 0 is the most
# frequent training label, not necessarily the raw label "1" -- verify.
positive_precision_amount = confusion.get((0.0, 0.0), 0)
negative_precision_amount = confusion.get((1.0, 1.0), 0)
positive_false_amount = confusion.get((0.0, 1.0), 0)
negative_false_amount = confusion.get((1.0, 0.0), 0)
print(" Positive sample Forecast exact number: {}, Negative sample forecast exact quantity: {} ".format(positive_precision_amount, negative_precision_amount))

positive_amount = sum(n for (label, _), n in confusion.items() if label == 0)
negative_amount = sum(n for (label, _), n in confusion.items() if label == 1)
print("number of positive samples: {}, Negative sample number: {} ".format(positive_amount, negative_amount))
print(" Number of positive sample predictions error: {}, negative sample error exact quantity: {} ".format(positive_false_amount, negative_false_amount))

recall_rate1 = _ratio(positive_precision_amount, positive_amount)
recall_rate2 = _ratio(negative_precision_amount, negative_amount)
print("Positive sample recall rate is: {}, negative sample recall rate is: {}".format(recall_rate1, recall_rate2))