#### You need to install a few R packages first; if you already have them, you can skip these installation steps.
# install.packages("Rwordseg")
# install.packages("tm")
# install.packages("wordcloud")
# install.packages("topicmodels")
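Note that Rwordseg is not on CRAN; it depends on rJava and has typically been installed from R-Forge. A hedged sketch of that install command (the repository location may have changed since this was written):

# install.packages("Rwordseg", repos = "http://R-Forge.R-project.org", type = "source")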
The data used in this example
The data comes from the Sogou Labs corpus. Data URL: http://download.labs.sogou.com/dl/sogoulabdown/SogouC.mini.20061102.tar.gz

File structure:
└─Sample
    ├─C000007 Auto
    ├─C000008 Finance
    ├─C000010 IT
    ├─C000013 Health
    ├─C000014 Sports
    ├─C000016 Tourism
    ├─C000020 Education
    ├─C000022 Recruitment
    ├─C000023 Culture
    └─C000024 Military

Python is used to preprocess the data into a train.csv file, collapsing each document's text into a single row.
Preprocessing Python script:
Attachment: combinesample.zip (720 Bytes)
The preprocessed data used below:
Attachment: train.zip (130.2 KB)
You can also use R to transform the raw data into train.csv directly; a minimal sketch follows.
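This sketch assumes the extracted Sample directory sits under D://wb// and that the text files are GBK-encoded (both assumptions; adjust the path and encoding to your setup). The column names type and text are chosen to match what the code later in this post expects:

# Build train.csv from the Sample folders: one row per document
dirs <- list.dirs("D://wb//Sample", recursive = FALSE)
train <- do.call(rbind, lapply(dirs, function(d) {
  files <- list.files(d, pattern = "\\.txt$", full.names = TRUE)
  texts <- sapply(files, function(f) {
    con <- file(f, encoding = "GBK")   # Sogou files are assumed to be GBK-encoded
    txt <- paste(readLines(con, warn = FALSE), collapse = " ")
    close(con)
    txt
  })
  # The folder code (e.g. C000007) is used as the label here; the original
  # preprocessing script may map these codes to names such as Auto, Finance, etc.
  data.frame(type = basename(d), text = texts, stringsAsFactors = FALSE)
}))
write.csv(train, "D://wb//train.csv", row.names = FALSE)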
Stopword list used in this article:
Attachment: stopwords.zip (2.96 KB)
1. Read the data
csv <- read.csv("D://wb//train.csv", header = T, stringsAsFactors = F)
mystopwords <- unlist(read.table("D://wb//stopwords.txt", stringsAsFactors = F))
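The rest of the walkthrough assumes csv has a category column (its first column, used as csv$type and csv[[1]] below) and a text column with the document contents. A quick sanity check:

str(csv)          # expect a type column and a text column
head(csv$type)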
2. Data preprocessing (Chinese word segmentation, stopword removal)
library(tm)
# Remove digits
removeNumbers = function(x) { ret = gsub("[0-9]", "", x) }
sample.words <- lapply(csv$text, removeNumbers)
# Chinese word segmentation, using the Rwordseg package
wordsegment <- function(x) {
  library(Rwordseg)
  segmentCN(x)
}
sample.words <- lapply(sample.words, wordsegment)
### Stopword removal
### Do the Chinese word segmentation first and remove stopwords afterwards, so that global replacement does not destroy information
removeStopWords = function(x, words) {
  ret = character(0)
  index <- 1
  it_max <- length(x)
  while (index <= it_max) {
    if (length(words[words == x[index]]) < 1) ret <- c(ret, x[index])
    index <- index + 1
  }
  ret
}
sample.words <- lapply(sample.words, removeStopWords, mystopwords)
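As a design note, the while loop above works but is not idiomatic R; the same filtering can be written with vectorized subsetting (a sketch with the same inputs and outputs, not part of the original post):

# Keep only the tokens that are not stopwords
removeStopWords2 <- function(x, words) x[!(x %in% words)]
# sample.words <- lapply(sample.words, removeStopWords2, mystopwords)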
3. Wordcloud Display
# Build the corpus
corpus = Corpus(VectorSource(sample.words))
meta(corpus, "cluster") <- csv$type
unique_type <- unique(csv$type)
# Build the document-term matrix
(sample.dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(2, Inf))))
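It can be useful to glance at the size of the matrix and its most frequent terms before drawing anything (a quick check using tm helpers; the threshold of 50 is arbitrary):

dim(sample.dtm)                 # documents x terms
findFreqTerms(sample.dtm, 50)   # terms occurring at least 50 times overall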
# install.packages("wordcloud")  # the wordcloud package is required
library(wordcloud)
# Wordcloud comparison across documents
sample.tdm <- TermDocumentMatrix(corpus, control = list(wordLengths = c(2, Inf)))
tdm_matrix <- as.matrix(sample.tdm)
png(paste("D://wb//sample_comparison", ".png", sep = ""), width = 1500, height = 1500)  # width missing in the original; set equal to height
comparison.cloud(tdm_matrix, colors = rainbow(ncol(tdm_matrix)))  # colors modified slightly because of a color issue
title(main = "Sample comparison")
dev.off()
# Wordcloud comparison aggregated by category
n <- nrow(csv)
zz1 = 1:n
cluster_matrix <- sapply(unique_type, function(type) { apply(tdm_matrix[, zz1[csv$type == type]], 1, sum) })
png(paste("D://wb//sample_cluster_comparison", ".png", sep = ""), width = 800, height = 800)  # width missing in the original; set equal to height
comparison.cloud(cluster_matrix, colors = brewer.pal(ncol(cluster_matrix), "Paired"))  # modified slightly because there are too few color classes
title(main = "Sample cluster comparison")
dev.off()
(Figure: sample cluster comparison wordcloud)
It can be seen that the data distribution is not uniform; categories such as Culture and Auto contain very few documents.
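The same imbalance can be read directly from the data rather than from the cloud (a quick check using the csv object loaded earlier):

# Number of documents per category
sort(table(csv$type), decreasing = TRUE)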
# Draw a wordcloud for each category
sample.cloud <- function(cluster, maxwords = 100) {
  words <- sample.words[which(csv$type == cluster)]
  allwords <- unlist(words)
  wordsfreq <- sort(table(allwords), decreasing = T)
  wordsname <- names(wordsfreq)
  png(paste("D://wb//sample_", cluster, ".png", sep = ""), width = 600, height = 600)  # width missing in the original; set equal to height
  wordcloud(wordsname, wordsfreq, scale = c(6, 1.5), min.freq = 2, max.words = maxwords, colors = rainbow(100))
  title(main = paste("cluster:", cluster))
  dev.off()
}
lapply(unique_type, sample.cloud)  # unique(csv$type)
(Figures: per-category wordclouds)
4. Topic model analysis
library(slam)
summary(col_sums(sample.dtm))
term_tfidf <- tapply(sample.dtm$v / row_sums(sample.dtm)[sample.dtm$i], sample.dtm$j, mean) *
  log2(nDocs(sample.dtm) / col_sums(sample.dtm > 0))
summary(term_tfidf)
# Keep only terms with mean tf-idf of at least 0.1, then drop documents that became empty
sample.dtm <- sample.dtm[, term_tfidf >= 0.1]
sample.dtm <- sample.dtm[row_sums(sample.dtm) > 0, ]
library(topicmodels)
k <- 30
SEED <- 2010
sample_tm <- list(
  VEM = LDA(sample.dtm, k = k, control = list(seed = SEED)),
  VEM_fixed = LDA(sample.dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs = LDA(sample.dtm, k = k, method = "Gibbs",
              control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)),  # thin value missing in the original; 100 assumed
  CTM = CTM(sample.dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3)))
)
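One quick way to compare the fitted models on the training data is via their log-likelihoods (a sketch, not part of the original post; VEM and Gibbs likelihoods are computed differently, so the values are only loosely comparable across methods):

sapply(sample_tm, logLik)

The post itself instead inspects the estimated alpha values and the entropy of the per-document topic distributions: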
sapply(sample_tm[1:2], slot, "alpha")
sapply(sample_tm, function(x) mean(apply(posterior(x)$topics, 1, function(z) -sum(z * log(z)))))
The estimated alpha is much smaller than the default value (50/k), which indicates that the Dirichlet distribution is concentrated: each document covers only a small number of topics. The second line computes the mean entropy of the per-document topic distributions; higher values indicate a more uniform spread of topics within documents.
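As a small illustration of the entropy measure used above (made-up probability vectors, not model output):

entropy <- function(z) -sum(z * log(z))
entropy(rep(1/30, 30))              # uniform over 30 topics: log(30), about 3.40 (the maximum)
entropy(c(0.9, rep(0.1/29, 29)))    # mass concentrated on one topic: about 0.66

Returning to the fitted VEM model, the most likely topic per document and the top terms per topic are: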
# Most likely topic for each document
topic <- topics(sample_tm[["VEM"]], 1)
table(topic)
# Top 5 terms for each topic
terms <- terms(sample_tm[["VEM"]], 5)
terms[, 1:10]
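Since the original category labels are available in csv, the inferred topics can also be cross-tabulated against them to see how well the unsupervised topics line up with the known classes. A sketch, assuming the document names of sample.dtm are still the original row numbers of csv (the default for a corpus built from a VectorSource), so that documents dropped during filtering can be aligned by id:

# Which original rows survived the tf-idf / empty-document filtering
kept <- as.integer(Docs(sample.dtm))
# Rows: original category, columns: most likely topic
table(csv$type[kept], topic)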
######### Topics assigned to the articles in the Auto category
(topics_auto <- topics(sample_tm[["VEM"]])[grep("Auto", csv[[1]])])
most_frequent_auto <- which.max(tabulate(topics_auto))
######### The 10 terms most associated with the most frequent Auto topic
terms(sample_tm[["VEM"]], 10)[, most_frequent_auto]
R language: text mining, topic models, and text classification