# Preface: this tutorial uses R to connect to a DB2 database, performs text
# analysis on the retrieved messages, and produces a word cloud.
# The tutorial is written in R Markdown.
---
title: "Website Message Analysis"
output: html_document
---
```{r setup, include=FALSE}
# Global knitr chunk options: hide source code in the rendered document and
# suppress errors, warnings, messages, and the "##" output comment prefix.
knitr::opts_chunk$set(echo = FALSE, error = FALSE, warning = FALSE,
                      message = FALSE, comment = "")
```
```{r}
# Connect to the DB2 database via ODBC and read the message data.
library(RODBC)
options(scipen = 200)  # avoid scientific notation when printing numbers

conn <- odbcConnect(dsn = "DB2", uid = "Tjfx", pwd = "1")
myfile <- sqlQuery(conn, "select * from T_from_last_year_liuyan")
odbcClose(conn)  # release the connection as soon as the data is in memory

# Keep only column 4 — presumably the free-text message body; confirm
# against the table schema.
myfile <- myfile[, 4]
# head(myfile)

# Load the Chinese word-segmentation libraries.
library("rJava")
library("Rwordseg")
# Load the word-cloud plotting libraries.
library("RColorBrewer")
library("wordcloud")

# Pre-processing: drop empty strings so the text can be segmented.
# Without this step segmentation fails.
myfile.res <- myfile[myfile != ""]
```
```{r, echo=FALSE, error=FALSE, warning=FALSE, message=FALSE, comment="", results='hide'}
# Load the user dictionary: once the custom thesaurus file has been built,
# install it into the segmenter with the statement below.
installDict("Ciku.txt", dictname = "My Dict")
```
```{r}
# Inspect installed dictionaries:
# listDict()
# Remove an installed dictionary:
# uninstallDict()

myfile.res <- as.character(myfile.res)

# Segment each message and flatten the results into one word vector.
myfile.words <- unlist(lapply(X = myfile.res, FUN = segmentCN))

# Strip URLs and other unwanted characters; add further gsub() calls below
# for any additional special characters that need removing.
myfile.words <- gsub(pattern = "http:[a-zA-Z\\/\\.0-9]+", "", myfile.words)
myfile.words <- gsub("\n", "", myfile.words)
myfile.words <- gsub(" ", "", myfile.words)  # NOTE(review): original pattern was garbled; assuming spaces were being removed

# Remove stop words: read the stop-word list and filter in one vectorized
# step (replaces the original O(n*m) per-word subset loop).
data_stw <- read.table(file = "Mystopword.txt", colClasses = "character")
stopwords_cn <- data_stw[, 1]
myfile.words <- myfile.words[!myfile.words %in% stopwords_cn]

# Filter out single-character words.
myfile.words <- subset(myfile.words, nchar(as.character(myfile.words)) > 1)

# Compute word frequencies, sorted from most to least frequent.
myfile.freq <- table(unlist(myfile.words))
myfile.freq <- rev(sort(myfile.freq))
# as.integer() keeps the column named "freq"; passing the table directly
# would create "freq.Var1"/"freq.Freq" columns and break $freq below.
myfile.freq <- data.frame(word = names(myfile.freq),
                          freq = as.integer(myfile.freq))

# Keep only words occurring at least 5 times; adjust the threshold as needed.
myfile.freq2 <- subset(myfile.freq, myfile.freq$freq >= 5)

# Draw the word cloud.
# Colour palette (RColorBrewer palette names are case-sensitive: "Dark2"):
mycolors <- brewer.pal(8, "Dark2")
# Register a font for Chinese glyphs (Windows only).
# NOTE(review): original font name was garbled ("Microsoft Jas Black");
# assuming "Microsoft YaHei" — confirm on the target machine.
windowsFonts(myFont = windowsFont("Microsoft YaHei"))
# Fixed seed so the word-cloud layout is reproducible.
set.seed(123)
wordcloud(myfile.freq2$word, myfile.freq2$freq, random.order = FALSE,
          random.color = FALSE, colors = mycolors, family = "myFont")
# Print the frequency table without row numbers or quotes.
write.table(myfile.freq2, row.names = FALSE, quote = FALSE)
```
#结果图 (result figure):
The output word-frequency document has the row numbers removed; see the code above.
This concludes the tutorial: R connects to the DB2 database and produces a word cloud.