#----------------------------------------
# Purpose : Demonstrate the Naive Bayes (NB) modelling process
# Data set: SMS text messages labelled spam / ham
# tm package (from the Vienna University of Economics and Business)
#----------------------------------------

library(tm)         # text mining: corpus, tm_map(), DocumentTermMatrix()
library(wordcloud)  # word clouds (attaches RColorBrewer for brewer.pal())
library(e1071)      # naiveBayes()
library(gmodels)    # CrossTable()

## Step 1: Collect data -------------------------------------------------

# Import the CSV file. Text must stay character, not factor.
sms_raw <- read.csv(
  "/users/chenyangang/r language/data/sms_spam.csv",
  stringsAsFactors = FALSE
)

## Step 2: Explore and prepare data -------------------------------------

# The class label (spam / ham) is categorical, so store it as a factor.
sms_raw$type <- factor(sms_raw$type)

# Create the corpus. VCorpus is requested explicitly so that every element
# is a full text document that all of the tm_map() steps below can handle.
sms_corpus <- VCorpus(VectorSource(sms_raw$text))

# Look at the data.
print(sms_corpus)
inspect(sms_corpus[1:3])

# Additional, project-specific stop words.
stopword_vector <- c("Supplier", "Order")

# Clean up the corpus using tm_map().
# tolower() is a plain base function, so it has to be wrapped in
# content_transformer(); otherwise it replaces the documents with bare
# character vectors.
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)

# Remove the additional stop words. The text was lower-cased above and
# removeWords() matches case-sensitively, so lower-case the words too.
corpus_clean <- tm_map(corpus_clean, removeWords, tolower(stopword_vector))

# Finally convert every document to a PlainTextDocument object.
corpus_plain <- tm_map(corpus_clean, PlainTextDocument)

# Create the sparse document-term matrix.
sms_dtm <- DocumentTermMatrix(corpus_plain, control = list())

# Split into training and test sets by row position.
# NOTE(review): the indices assume the CSV has at least 5559 rows - confirm.
train_idx <- 1:4169
test_idx <- 4170:5559

# Raw data frame.
sms_raw_train <- sms_raw[train_idx, ]
sms_raw_test <- sms_raw[test_idx, ]

# Document-term matrix.
sms_dtm_train <- sms_dtm[train_idx, ]
sms_dtm_test <- sms_dtm[test_idx, ]

# Cleaned corpus.
sms_corpus_train <- corpus_plain[train_idx]
sms_corpus_test <- corpus_plain[test_idx]

# Compare the spam / ham proportions in the training and test sets.
print(prop.table(table(sms_raw_train$type)))
print(prop.table(table(sms_raw_test$type)))

# Word clouds. Well-separated colours work best here; Dark2 and Set1 from
# RColorBrewer are recommended. Palette names are case-sensitive.
pal2 <- brewer.pal(8, "Dark2")

# The original call also passed `min.words`, which is not a wordcloud()
# argument (it would only be forwarded to text()), so it is dropped.
wordcloud(
  corpus_plain,
  scale = c(3, 0.5), min.freq = 10,
  random.order = FALSE, rot.per = .15, colors = pal2
)

# NOTE(review): the min.freq value was lost in the source; 40 is assumed -
# TODO confirm.
wordcloud(
  sms_corpus_train,
  min.freq = 40,
  random.order = FALSE, rot.per = .15, colors = pal2
)

# Separate spam from non-spam in the training data.
# NOTE(review): assumes the labels in the CSV are the lower-case strings
# "spam" and "ham" - confirm against the data.
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")

# Word clouds for spam and for non-spam messages.
# To save a picture, wrap the call in png() / dev.off():
#   png(file = "/users/chenyangang/01.png", bg = "transparent")
#   dev.off()
# NOTE(review): the max.words value was lost in the source; 40 is assumed -
# TODO confirm.
wordcloud(
  spam$text,
  max.words = 40, scale = c(3, 0.5),
  random.order = FALSE, rot.per = .15, colors = pal2
)
wordcloud(
  ham$text,
  max.words = 40, scale = c(3, 0.5),
  random.order = FALSE, rot.per = .15, colors = pal2
)

# Create indicator features for frequently occurring words: keep the terms
# that appear at least 5 times.
# NOTE(review): the term list is built from the full raw corpus
# (sms_corpus), i.e. it includes the test messages and is not run through
# the same cleaning pipeline as corpus_plain - verify this is intended.
sms_term <- TermDocumentMatrix(
  sms_corpus,
  control = list(removePunctuation = TRUE, stopwords = TRUE)
)

# Earlier alternatives for building the dictionary, kept for reference:
#   sms_dict <- Dictionary(findFreqTerms(sms_dtm_train, 5))
#   sms_list <- Terms(findFreqTerms(sms_term, 5))
sms_dict <- findFreqTerms(sms_term, 5)

# Restrict both matrices to the dictionary terms.
sms_train <- DocumentTermMatrix(
  sms_corpus_train,
  list(dictionary = sms_dict)
)
sms_test <- DocumentTermMatrix(
  sms_corpus_test,
  list(dictionary = sms_dict)
)

# Convert a vector of term counts to a two-level factor:
# 0 -> "No", any positive count -> "yes".
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "yes"))
}

# Convert the training and test matrices column by column.
sms_train <- apply(sms_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_test, MARGIN = 2, convert_counts)

## Step 3: Train the model ----------------------------------------------
#----------------------------------------------
# Building the classifier:
#   m <- naiveBayes(train, class, laplace = 0)
#     train  : data frame or matrix containing the training data
#     class  : factor vector with the class of each training row
#     laplace: number controlling Laplace smoothing (default 0)
#   Returns a naive Bayes model object that can be used for prediction.
#
# Making predictions:
#   p <- predict(m, test, type = "class")
#     m   : model trained by naiveBayes(train, class, laplace = 0)
#     test: data frame or matrix of test data with the same features as
#           the training data used to build the classifier
#     type: "class" or "raw" - predict the most likely class value or the
#           raw predicted probabilities
#   Returns a vector of predicted classes or raw probabilities, depending
#   on `type`.
#
# Example:
#   sms_classifier <- naiveBayes(sms_train, sms_raw_train$type)
#   sms_test_pred <- predict(sms_classifier, sms_test)
#----------------------------------------------
sms_classifier <- naiveBayes(sms_train, sms_raw_train$type)
print(sms_classifier)

## Step 4: Evaluate model performance -----------------------------------
sms_test_pred <- predict(sms_classifier, sms_test)
CrossTable(
  sms_test_pred, sms_raw_test$type,
  prop.chisq = TRUE, prop.t = TRUE, prop.r = TRUE,
  dnn = c('predicted', 'actual')
)

## Step 5: Improve model performance ------------------------------------
# Apply Laplace smoothing: essentially, add a small number to every
# frequency count.
sms_classifier2 <- naiveBayes(sms_train, sms_raw_train$type, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(
  sms_test_pred2, sms_raw_test$type,
  prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
  dnn = c('predicted', 'actual')
)
# Machine learning with R: Naive Bayes (NB)