Machine learning and R language: NB

Source: Internet
Author: User
Tags: dnn

#----------------------------------------
# Purpose : demonstrate the Naive Bayes (NB) modelling process
# Data set: SMS text messages labelled spam / ham
# Packages: tm (text mining, from the Vienna University of Economics and
#           Business), wordcloud, e1071, gmodels
#----------------------------------------

## Step 1: Collect data ----

# Import the CSV file. The path is specific to the original author's
# machine; adjust it to wherever sms_spam.csv lives.
sms_raw <- read.csv(
  "/users/chenyangang/r language/data/sms_spam.csv",
  stringsAsFactors = FALSE
)

## Step 2: Explore and prepare the data ----

# Turn the categorical spam/ham label into a factor.
sms_raw$type <- factor(sms_raw$type)

# Load the text-mining package.
library(tm)

# Create the corpus from the message text.
sms_corpus <- Corpus(VectorSource(sms_raw$text))

# Look at the data.
print(sms_corpus)
inspect(sms_corpus[1:3])

# Additional, user-defined stop words. They are lower case because the
# corpus is lower-cased before they are removed.
stopword_vector <- c("supplier", "order")

# Clean up the corpus with tm_map().
# tolower() is a base function, not a tm transformation, so it is wrapped in
# content_transformer() (required by tm >= 0.6).
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)

# Remove the additional stop words.
corpus_clean <- tm_map(corpus_clean, removeWords, stopword_vector)

# The original script finished with tm_map(corpus_clean, PlainTextDocument)
# to turn the output of the bare tolower() back into text documents. With
# content_transformer() above that conversion is not needed, so the cleaned
# corpus is used directly; the name corpus_plain is kept because the
# following steps refer to it.
corpus_plain <- corpus_clean

# Create the sparse document-term matrix.
sms_dtm <- DocumentTermMatrix(corpus_plain, control = list())

# Split into training and test sets: first the raw data frame ...
# NOTE(review): the row ranges are hard-coded and assume the file has
# 5559 rows -- confirm against the actual data.
sms_raw_train <- sms_raw[1:4169, ]
sms_raw_test <- sms_raw[4170:5559, ]

# ... then the document-term matrix ...
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]

# ... and finally the corpus.
sms_corpus_train <- corpus_plain[1:4169]
sms_corpus_test <- corpus_plain[4170:5559]

# Proportion of spam/ham in the training and in the test set.
prop.table(table(sms_raw_train$type))
prop.table(table(sms_raw_test$type))

# Load the word-cloud package.
library(wordcloud)

# Clearly distinguishable colours work best here; "Dark2" and "Set1" from
# RColorBrewer are recommended.
pal2 <- brewer.pal(8, "Dark2")

# The original call also passed `min.words`, which is not a wordcloud()
# argument, so it has been removed.
wordcloud(
  corpus_plain,
  scale = c(3, 0.5),
  min.freq = 10,
  random.order = FALSE,
  rot.per = .15,
  colors = pal2
)

# NOTE(review): the numeric values of min.freq / max.words below were
# garbled in the source text; 40 is the value used by the textbook example
# this script follows -- confirm.
wordcloud(
  sms_corpus_train,
  min.freq = 40,
  random.order = FALSE,
  rot.per = .15,
  colors = pal2
)

# Separate the training data into spam and non-spam messages.
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")

# Word clouds for spam and for ham. To save a picture, wrap the call in
# png() / dev.off():
#-- png(file = "/users/chenyangang/01.png", bg = "transparent")
#-- dev.off()
wordcloud(
  spam$text,
  max.words = 40,
  scale = c(3, 0.5),
  random.order = FALSE,
  rot.per = .15,
  colors = pal2
)
wordcloud(
  ham$text,
  max.words = 40,
  scale = c(3, 0.5),
  random.order = FALSE,
  rot.per = .15,
  colors = pal2
)

# Create indicator features for frequently occurring words.
sms_term <- TermDocumentMatrix(
  sms_corpus,
  control = list(removePunctuation = TRUE, stopwords = TRUE)
)

# Dictionary of the terms that occur at least 5 times.
# Earlier variants, kept for reference:
# sms_dict <- Dictionary(findFreqTerms(sms_dtm_train, 5))
# sms_list <- Terms(findFreqTerms(sms_term, 5))
sms_dict <- findFreqTerms(sms_term, 5)

sms_train <- DocumentTermMatrix(
  sms_corpus_train,
  list(dictionary = sms_dict)
)
sms_test <- DocumentTermMatrix(
  sms_corpus_test,
  list(dictionary = sms_dict)
)

# Convert term counts into a "No"/"Yes" factor.
#
# x: numeric vector of term counts.
# Returns a factor with levels "No" (count is 0) and "Yes" (count > 0).
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "Yes"))
}

# Convert the training and the test data column by column.
sms_train <- apply(sms_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_test, MARGIN = 2, convert_counts)

## Step 3: Train the model ----
#----------------------------------------------
# Building the classifier:
#   m <- naiveBayes(train, class, laplace = 0)
#     train:   data frame or matrix containing the training data
#     class:   factor vector with the class of each training row
#     laplace: number controlling the Laplace estimator (default 0)
#   The function returns a naive Bayes object that can be used to predict.
#
# Making predictions:
#   p <- predict(m, test, type = "class")
#     m:    model object trained by naiveBayes(train, class, laplace = 0)
#     test: data frame or matrix containing the test data, with the same
#           features as the training data used to build the classifier
#     type: "class" or "raw" -- predict the most likely class value or the
#           raw predicted probabilities
#   The function returns a vector of predicted class values or raw
#   predicted probabilities, depending on `type`.
#
# Example:
#   sms_classifier <- naiveBayes(sms_train, sms_raw_train$type)
#   sms_test_pred <- predict(sms_classifier, sms_test)
#----------------------------------------------
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_raw_train$type)
sms_classifier

## Step 4: Evaluate model performance ----
sms_test_pred <- predict(sms_classifier, sms_test)

library(gmodels)
CrossTable(
  sms_test_pred,
  sms_raw_test$type,
  prop.chisq = TRUE,
  prop.t = TRUE,
  prop.r = TRUE,
  dnn = c("predicted", "actual")
)

## Step 5: Improve model performance ----
# Apply the Laplace estimator: essentially, add a small number to each
# count in the frequency table.
sms_classifier2 <- naiveBayes(sms_train, sms_raw_train$type, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(
  sms_test_pred2,
  sms_raw_test$type,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.r = FALSE,
  dnn = c("predicted", "actual")
)

  

Machine learning and R language: NB

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.