Borrowed from the third article on sentiment analysis on Su Jianlin's ("the Great God") blog; on top of it, stop-word filtering has been added. Stop-word list download link: stop words
Code Environment:
python2.7
tensorflow-gpu 1.0
jieba
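If you want to confirm that your environment matches the versions listed above, here is a minimal sanity check (my own addition, not from the original post; keras is also needed by the code below even though it is not listed):

import sys
import tensorflow as tf
import keras          # prints "Using TensorFlow backend." on import
import jieba          # just confirms that jieba is importable

print sys.version     # should report 2.7.x
print tf.__version__  # should report 1.0.x
print keras.__version__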
After testing, the accuracy is as high as 98%.
The code is as follows:
#-*- coding: utf-8 -*-
'''
On a GTX 1070, each epoch takes about 11 s; after 30 epochs the training-set
accuracy is 98.41%. Do not use too much dropout, otherwise too much
information is lost.
'''

import numpy as np
import pandas as pd
import jieba

pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 100)      # value lost in the original formatting; 100 is a placeholder
pd.set_option('display.max_colwidth', 100)  # value lost in the original formatting; 100 is a placeholder
pd.set_option('expand_frame_repr', False)   # do not wrap wide frames across lines

# read the stop words (HIT stop-word list; the original Chinese file name was lost in translation)
stopwords = 'hit stop glossary.txt'
stop_single_words = []
with open(stopwords, 'r') as f:
    for line in f:
        content = line.strip()
        stop_single_words.append(content.decode('GBK'))
print stop_single_words

# read the positive and negative sentiment samples
pos = pd.read_excel('pos.xls', header=None)
pos['label'] = 1
neg = pd.read_excel('neg.xls', header=None)
neg['label'] = 0
all_ = pos.append(neg, ignore_index=True)

# segment with jieba and drop the stop words
all_['words'] = all_[0].apply(lambda s: [i for i in list(jieba.cut(s)) if i not in stop_single_words])
print all_[:5]

maxlen = 100     # number of words each document is truncated to (value lost in the original formatting; 100 is a placeholder)
min_count = 5    # words appearing fewer times than this are dropped -- the simplest form of dimensionality reduction

content = []
for i in all_['words']:
    content.extend(i)

abc = pd.Series(content).value_counts()
abc = abc[abc >= min_count]
abc[:] = range(1, len(abc) + 1)
abc[''] = 0      # add the empty string, used for padding
word_set = set(abc.index)

def doc2num(s, maxlen):
    s = [i for i in s if i in word_set]
    s = s[:maxlen] + [''] * max(0, maxlen - len(s))
    return list(abc[s])

all_['doc2num'] = all_['words'].apply(lambda s: doc2num(s, maxlen))

# shuffle the data manually
idx = range(len(all_))
np.random.shuffle(idx)
all_ = all_.loc[idx]

# generate the arrays in the shape Keras expects
x = np.array(list(all_['doc2num']))
y = np.array(list(all_['label']))
y = y.reshape((-1, 1))   # reshape the labels

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding
from keras.layers import LSTM

# build the model
model = Sequential()
model.add(Embedding(len(abc), 256, input_length=maxlen))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

batch_size = 128
train_num = 15000   # the original text shows only "15", which looks truncated; set this to the size of your training split

model.fit(x[:train_num], y[:train_num], batch_size=batch_size, nb_epoch=30)
model.evaluate(x[train_num:], y[train_num:], batch_size=batch_size)

def predict_one(s):   # prediction for a single sentence
    s = np.array(doc2num(list(jieba.cut(s)), maxlen))
    s = s.reshape((1, s.shape[0]))
    return model.predict_classes(s, verbose=0)[0][0]
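After training, predict_one can be called on a raw sentence as a quick sanity check. The review texts below are made-up examples of my own (1 = positive, 0 = negative):

print predict_one(u'这个手机很好用，我很喜欢')   # made-up positive review, expected output: 1
print predict_one(u'质量太差了，非常失望')       # made-up negative review, expected output: 0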
I had planned to try Word2vec on the word vectors here, but considering that word segmentation makes the relationships between the data quite fragile, I did not try it. If anyone has done it, please let me know.
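For anyone who wants to try the Word2vec idea, here is a minimal sketch of how it could be wired in (my own assumption, not code from the original post): train gensim's Word2Vec on the already segmented corpus and use the vectors as initial weights for the Embedding layer. It continues from the variables all_, abc and maxlen defined above; w2v_dim is an assumed name, and the 'size' argument follows the older gensim API that matches this code's era.

from gensim.models import Word2Vec

w2v_dim = 256   # assumed embedding size, matching the Embedding layer above
w2v = Word2Vec(list(all_['words']), size=w2v_dim, min_count=5)   # 'size' is the old gensim argument name

# build an index -> vector matrix aligned with the word index abc built above
embedding_matrix = np.zeros((len(abc), w2v_dim))
for word, index in abc.iteritems():
    if word in w2v:
        embedding_matrix[index] = w2v[word]

# same model as above, but the Embedding layer starts from the Word2vec weights
model = Sequential()
model.add(Embedding(len(abc), w2v_dim, input_length=maxlen, weights=[embedding_matrix]))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])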