Import Java.util.Arrays; Import org.apache.spark.SparkConf; Import Org.apache.spark.api.java.JavaRDD; Import Org.apache.spark.api.java.JavaSparkContext; Import org.apache.spark.api.java.function.Function; Import Org.apache.spark.mllib.classification.LogisticRegressionModel; Import Org.apache.spark.mllib.classification.LogisticRegressionWithSGD; Import Org.apache.spark.mllib.feature.HashingTF; Import Org.apache.spark.mllib.linalg.Vector; Import Org.apache.spark.mllib.regression.LabeledPoint; /** * Created by Hui on 2017/11/29. */ public class MLlib { public static void Main (string[] args) { sparkconf sparkconf = new sparkconf (). Setappname ("Javabookexample"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (sparkconf); Load 2 types of emails from text files:spam and Ham (non-spam). Each of the line have text from one email. javardd<string> spam = Sc.textfile ("Files/spam.txt"); javardd<string> ham = sc.textfile ("Files/ham.txt"); Create a HASHINGTF instance to map e-mail text to vectors of features. Final HASHINGTF tf = new HASHINGTF (100); Each e-mail is a split into words with each word being mapped to one feature. Create labeledpoint datasets for positive (spam) and negative (ham) examples. javardd<labeledpoint> positiveexamples = Spam.map (New function<string, labeledpoint> () { @Override Public labeledpoint Call (String email) { return new Labeledpoint (1, Tf.transform (Arrays.aslist (Email.split (")))); } }); javardd<labeledpoint> negativeexamples = Ham.map (New function<string, labeledpoint> () { @Override Public labeledpoint Call (String email) { return new Labeledpoint (0, Tf.transform (arrays.aslist (Email.split (")))); } }); javardd<labeledpoint> trainingdata = positiveexamples.union (negativeexamples); Trainingdata.cache (); Cache data since Logistic Regression is an iterative algorithm. Create a Logistic Regression learner which uses the Lbfgs optimizer. LOGISTICREGRESSIONWITHSGD Lrlearner = new LOGISTICREGRESSIONWITHSGD (); Run The actual learning algorithm on the training data. Logisticregressionmodel model = Lrlearner.run (Trainingdata.rdd ()); Test on a positive example (spam) and a negative one (ham). First apply the same HASHINGTF feature transformation used on the training data. Vector postestexample = Tf.transform (Arrays.aslist ("O M G GET cheap stuff by sending ...". Split (""))); Vector negtestexample = Tf.transform (Arrays.aslist ("Hi Dad, I started studying Spark the other ...". Split (""))); Now with the learned model to predict Spam/ham for new emails. SYSTEM.OUT.PRINTLN ("Prediction for positive test example:" + model.predict (postestexample)); SYSTEM.OUT.PRINTLN ("Prediction for negative Test example:" + model.predict (negtestexample)); Sc.stop (); } } |