Machine learning and R language: NB

Last Update:2016-04-01 Source: Internet

Author: User

Tags dnn

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more ＞

# ------------------------------------------------------------------------------
# Purpose : Demonstrate the naive Bayes (NB) modelling process end to end.
# Data set: SMS text messages labelled as spam or ham.
# Packages: tm (text mining), wordcloud, e1071 (naiveBayes), gmodels
#           (CrossTable).
# ------------------------------------------------------------------------------

library(tm)
library(wordcloud)
library(e1071)
library(gmodels)

# Row ranges used for the train/test split of the 5559 messages.
train_rows <- 1:4169
test_rows <- 4170:5559

# NOTE(review): the numeric value of the word-cloud thresholds was lost in the
# extracted source; 40 is assumed here -- confirm against the original article.
cloud_threshold <- 40

## Step 1: Collect data ----

# Import the CSV file.
sms_raw <- read.csv(
  "/users/chenyangang/r language/data/sms_spam.csv",
  stringsAsFactors = FALSE
)

## Step 2: Explore and prepare the data ----

# Turn the categorical spam/ham label into a factor.
sms_raw$type <- factor(sms_raw$type)

# Create the corpus from the raw message text.
sms_corpus <- Corpus(VectorSource(sms_raw$text))

# Inspect the data.
print(sms_corpus)
inspect(sms_corpus[1:3])

# Additional, user-defined stop words. They are lower case because the corpus
# is lower-cased before they are removed and the removal is case-sensitive.
stopword_vector <- c("supplier", "order")

# Clean up the corpus using tm_map(). Base functions such as tolower() are
# wrapped in content_transformer() so the result is still a valid corpus.
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)

# Remove the additional stop words.
corpus_clean <- tm_map(corpus_clean, removeWords, stopword_vector)

# Final processed corpus. Because tolower() is applied through
# content_transformer(), no separate PlainTextDocument conversion is needed;
# the name is kept because the rest of the script refers to it.
corpus_plain <- corpus_clean

# Create the sparse document-term matrix.
sms_dtm <- DocumentTermMatrix(corpus_plain, control = list())

# Create the training and test data sets: first the raw data frame ...
sms_raw_train <- sms_raw[train_rows, ]
sms_raw_test <- sms_raw[test_rows, ]

# ... then the document-term matrix ...
sms_dtm_train <- sms_dtm[train_rows, ]
sms_dtm_test <- sms_dtm[test_rows, ]

# ... and finally the corpus.
sms_corpus_train <- corpus_plain[train_rows]
sms_corpus_test <- corpus_plain[test_rows]

# Compare the spam/ham proportions in the training and test data sets.
prop.table(table(sms_raw_train$type))
prop.table(table(sms_raw_test$type))

# Word clouds. Clearly distinguishable colours work best here; the Dark2 and
# Set1 palettes from RColorBrewer are recommended.
pal2 <- brewer.pal(8, "Dark2")

# `min.words` is not a wordcloud() argument, so only `min.freq` limits the plot.
wordcloud(
  corpus_plain,
  scale = c(3, 0.5),
  min.freq = 10,
  random.order = FALSE,
  rot.per = 0.15,
  colors = pal2
)
wordcloud(
  sms_corpus_train,
  min.freq = cloud_threshold,
  random.order = FALSE,
  rot.per = 0.15,
  colors = pal2
)

# Split the training data into spam and non-spam messages.
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")

# Plot a word cloud for spam and for ham separately.
# To save a picture, wrap the plotting call with png() / dev.off():
# png(file = "/users/chenyangang/01.png", bg = "transparent")
# dev.off()
wordcloud(
  spam$text,
  max.words = cloud_threshold,
  scale = c(3, 0.5),
  random.order = FALSE,
  rot.per = 0.15,
  colors = pal2
)
wordcloud(
  ham$text,
  max.words = cloud_threshold,
  scale = c(3, 0.5),
  random.order = FALSE,
  rot.per = 0.15,
  colors = pal2
)

# Create indicator features for frequently occurring words (5 times or more).
# NOTE(review): the dictionary is built from the full raw corpus (training and
# test messages), so test vocabulary reaches the feature set; the commented-out
# alternative below uses sms_dtm_train -- confirm which one is intended.
sms_term <- TermDocumentMatrix(
  sms_corpus,
  control = list(removePunctuation = TRUE, stopwords = TRUE)
)

# Build the dictionary from the terms that occur at least 5 times.
# sms_dict <- Dictionary(findFreqTerms(sms_dtm_train, 5))
# sms_list <- Terms(findFreqTerms(sms_term, 5))
sms_dict <- findFreqTerms(sms_term, 5)
sms_train <- DocumentTermMatrix(
  sms_corpus_train,
  list(dictionary = sms_dict)
)
sms_test <- DocumentTermMatrix(
  sms_corpus_test,
  list(dictionary = sms_dict)
)

# Convert a vector of term counts into a two-level presence factor.
#
# x: numeric vector of counts (one column of a document-term matrix).
# Returns a factor with levels 0/1 labelled "No"/"yes", where 1 means the
# count was greater than zero.
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "yes"))
}

# Convert the training and test matrices column by column.
sms_train <- apply(sms_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_test, MARGIN = 2, convert_counts)

## Step 3: Train the model ----
# ----------------------------------------------
# Creating the classifier:
#   m <- naiveBayes(train, class, laplace = 0)
#     train:   a data frame or matrix containing the training data
#     class:   a factor vector with the class of each row of the training data
#     laplace: a number controlling the Laplace estimator (default is 0)
#   The function returns a naive Bayes object that can be used for prediction.
#
# Making predictions:
#   p <- predict(m, test, type = "class")
#     m:    a model object trained by naiveBayes(train, class, laplace = 0)
#     test: a data frame or matrix containing the test data, with the same
#           features as the training data used to build the classifier
#     type: either "class" or "raw", i.e. predict the most likely class value
#           or the raw predicted probabilities
#   The function returns a vector of predicted class values or raw predicted
#   probabilities, depending on the value of `type`.
#
# Example:
#   sms_classifier <- naiveBayes(sms_train, sms_raw_train$type)
#   sms_test_pred <- predict(sms_classifier, sms_test)
# ----------------------------------------------
sms_classifier <- naiveBayes(sms_train, sms_raw_train$type)
sms_classifier

## Step 4: Evaluate model performance ----
sms_test_pred <- predict(sms_classifier, sms_test)
CrossTable(
  sms_test_pred,
  sms_raw_test$type,
  prop.chisq = TRUE,
  prop.t = TRUE,
  prop.r = TRUE,
  dnn = c('predicted', 'actual')
)

## Step 5: Improve model performance ----
# Apply the Laplace estimator: in essence, add a small number to each count in
# the frequency table.
sms_classifier2 <- naiveBayes(sms_train, sms_raw_train$type, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(
  sms_test_pred2,
  sms_raw_test$type,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.r = FALSE,
  dnn = c('predicted', 'actual')
)

Machine learning and R language: NB

This article is an English version of an article that was originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More