Chapter 9: Analyzing Textual Data and Social Media
1 Installing NLTK (omitted)
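Besides the library itself, the corpora and tagger models used by the examples below need to be fetched once. This is only a minimal sketch; the exact package names are an assumption based on the code in this chapter:

import nltk

# NLTK data used by the examples in this chapter
nltk.download('stopwords')                   # English stopword list
nltk.download('gutenberg')                   # Project Gutenberg corpus
nltk.download('punkt')                       # sentence tokenizer used by gutenberg.sents()
nltk.download('averaged_perceptron_tagger')  # default tagger behind nltk.pos_tag
nltk.download('universal_tagset')            # mapping to the coarse universal tags
nltk.download('movie_reviews')               # movie review corpus for the sentiment example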
2 Filtering out stopwords, names, and numbers
Example code:
import nltk

# Load the English stopword corpus
sw = set(nltk.corpus.stopwords.words('english'))
print('Stop words', list(sw)[:7])

# Look at some of the files in the gutenberg corpus
gb = nltk.corpus.gutenberg
print('Gutenberg files', gb.fileids()[-5:])

# Take the first two sentences of milton-paradise.txt as the text to filter
text_sent = gb.sents("milton-paradise.txt")[:2]
print('Unfiltered', text_sent)

# Filter out stopwords
for sent in text_sent:
    filtered = [w for w in sent if w.lower() not in sw]
    print('Filtered', filtered)
    # Tag the remaining words with their parts of speech
    tagged = nltk.pos_tag(filtered)
    print("Tagged", tagged)
    # Drop proper nouns (NNP) and cardinal numbers (CD)
    words = []
    for word in tagged:
        if word[1] != 'NNP' and word[1] != 'CD':
            words.append(word[0])
    print(words)

# Mapping between tag sets
# print(nltk.tag.tagset_mapping('ru-rnc', 'universal'))
Output:
Stop words ['his', 'only', 'because', 'with', 'each', 'myself', 'both']
Gutenberg files ['milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Unfiltered [['[', 'Paradise', 'Lost', 'by', 'John', 'Milton', '1667', ']'], ['Book', 'I']]
Filtered ['[', 'Paradise', 'Lost', 'John', 'Milton', '1667', ']']
Tagged [('[', 'JJ'), ('Paradise', 'NNP'), ('Lost', 'NNP'), ('John', 'NNP'), ('Milton', 'NNP'), ('1667', 'CD'), (']', 'NN')]
['[', ']']
Filtered ['Book']
Tagged [('Book', 'NN')]
['Book']
The tag set used in this example (annotations given for a few tags):
{'PRP$', 'PDT', 'CD', 'EX', '.', 'NNS', 'MD', 'PRP', 'RP', '(', 'VBD', '``', "''",
 'NN' (noun), 'LS', 'VBN', 'WRB', 'IN' (preposition), 'FW', 'POS', 'CC' (coordinating conjunction),
 ':', 'DT', 'VBZ', 'RBS', 'RBR', 'WP$', 'RB', 'SYM', 'JJS', 'JJR', 'UH', 'WDT', '#', ',', ')',
 'VB', 'NNPS', 'VBP' (verb), 'NNP', 'JJ' (adjective), 'WP', 'VBG', '$', 'TO' (the word "to")}
These tags map roughly onto the following 12 universal categories (a minimal sketch of obtaining these coarse tags directly follows):
'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.'
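This is not part of the original listing, but as a minimal sketch (assuming the 'averaged_perceptron_tagger' and 'universal_tagset' data have been downloaded), nltk.pos_tag can return these coarse categories directly by passing tagset='universal':

import nltk

sentence = ['Paradise', 'Lost', 'by', 'John', 'Milton']

# Fine-grained tags such as NNP, CD, IN ...
print(nltk.pos_tag(sentence))

# The same tokens mapped onto the coarse 12-category universal tag set
print(nltk.pos_tag(sentence, tagset='universal'))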
3 The bag-of-words model
Installing scikit-learn (omitted)
Example code:
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Load the following two files from the gutenberg corpus
gb = nltk.corpus.gutenberg
hamlet = gb.raw('shakespeare-hamlet.txt')
macbeth = gb.raw("shakespeare-macbeth.txt")

# Remove English stopwords
cv = CountVectorizer(stop_words='english')

# Print the feature vectors
print("Feature vector", cv.fit_transform([hamlet, macbeth]).toarray())

# Feature names are sorted alphabetically
# (in newer scikit-learn versions, use get_feature_names_out() instead)
print('Features', cv.get_feature_names()[:5])
Output:
Feature vector [[ 1 0 1..., 14 0 1]
[0 1 0 ..., 1 1 0]]
Features ['1599', '1603', 'abhominably', 'abhorred', 'abide']
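To make the structure of that feature vector concrete, here is a tiny, self-contained sketch (the two example sentences are made up for illustration): each column of the matrix corresponds to one vocabulary word, and each row holds one document's word counts.

from sklearn.feature_extraction.text import CountVectorizer

# Two made-up documents, only to illustrate the bag-of-words idea
docs = ["the cat sat on the mat", "the dog chased the cat"]

cv = CountVectorizer(stop_words='english')
bow = cv.fit_transform(docs).toarray()

# vocabulary_ maps each feature word to its column index
print(cv.vocabulary_)
# Each row is one document's word counts over that vocabulary
print(bow)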
4 Word frequency analysis
Example code:
import nltk
import string

def printLine(values, num, keyOrValue, tag):
    """
    Print the keys or values of the top num items of a frequency distribution, labeled with tag.
    :param values: the frequency distribution (dict-like)
    :param num: number of items to print
    :param keyOrValue: 0 to print keys, 1 to print values
    :param tag: output label
    """
    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    print(tag, ":", tmpValue)

# Load the document
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Remove stopwords and punctuation
sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)
filtered = [w.lower() for w in words if w.lower() not in sw and w.lower() not in punctuation]

# Build a FreqDist and print the most frequent keys and values
fd = nltk.FreqDist(filtered)
printLine(fd, 5, 0, "Words")
printLine(fd, 5, 1, "Counts")

# The most common single word and its count
print('Max', fd.max())
print('Count', fd['caesar'])

# The most common bigrams and their counts
fd = nltk.FreqDist(nltk.bigrams(filtered))
printLine(fd, 5, 0, "Bigrams")
printLine(fd, 5, 1, "Counts")
print('Bigram Max', fd.max())
print('Bigram count', fd[('let', 'vs')])

# The most common trigrams and their counts
fd = nltk.FreqDist(nltk.trigrams(filtered))
printLine(fd, 5, 0, "Trigrams")
printLine(fd, 5, 1, "Counts")
print('Trigram Max', fd.max())
print('Trigram count', fd[('enter', 'lucius', 'luc')])
Output:
Words : ['caesar', 'brutus', 'bru', 'haue', 'shall']
Counts : [190, 161, 153, 148, 125]
Max caesar
Count 190
Bigrams : [('let', 'vs'), ('wee', 'l'), ('mark', 'antony'), ('marke', 'antony'), ('st', 'thou')]
Counts : [16, 15, 13, 12, 12]
Bigram Max ('let', 'vs')
Bigram count 16
Trigrams : [('enter', 'lucius', 'luc'), ('wee', 'l', 'heare'), ('thee', 'thou', 'st'), ('beware', 'ides', 'march'), ('let', 'vs', 'heare')]
Counts : [4, 4, 3, 3, 3]
Trigram Max ('enter', 'lucius', 'luc')
Trigram count 4
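FreqDist also exposes this ranking directly through most_common, which returns (item, count) pairs sorted by frequency; a minimal sketch on the same filtered Julius Caesar words:

import nltk
import string

sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)
words = nltk.corpus.gutenberg.words("shakespeare-caesar.txt")
filtered = [w.lower() for w in words if w.lower() not in sw and w.lower() not in punctuation]

# most_common does what the printLine helper above does by hand
fd = nltk.FreqDist(filtered)
print(fd.most_common(5))   # e.g. [('caesar', 190), ('brutus', 161), ...]
print(nltk.FreqDist(nltk.bigrams(filtered)).most_common(5))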
5 Naive Bayes classification
Naive Bayes is a probabilistic algorithm based on Bayes' theorem from probability theory and mathematical statistics; it is "naive" because it assumes the features are conditionally independent of one another given the class.
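As a worked illustration with made-up numbers, Bayes' theorem P(class | feature) = P(feature | class) * P(class) / P(feature) gives the class probability that the NLTK classifier below estimates from its training data:

# Hypothetical counts, purely for illustration: suppose 60% of the tokens in a corpus
# are stopwords, and the feature we observe is "word length == 2".
p_stopword = 0.6                      # P(stopword)
p_len2_given_stopword = 0.25          # P(len == 2 | stopword)
p_len2_given_not_stopword = 0.02      # P(len == 2 | not stopword)

# Total probability of observing the feature
p_len2 = (p_len2_given_stopword * p_stopword
          + p_len2_given_not_stopword * (1 - p_stopword))

# Bayes' theorem: P(stopword | len == 2)
p_stopword_given_len2 = p_len2_given_stopword * p_stopword / p_len2
print(p_stopword_given_len2)  # ~0.949 with these made-up numbers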
Example code:
import nltk
import string
import random

# Stopword and punctuation sets
sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)

# Use word length as the single feature
def word_features(word):
    return {'len': len(word)}

# Is the word a stopword or a punctuation mark?
def isStopword(word):
    return word in sw or word in punctuation

# Load the file
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Label each word according to whether it is a stopword
labeled_words = [(word.lower(), isStopword(word.lower())) for word in words]
random.seed(42)
random.shuffle(labeled_words)
print(labeled_words[:5])

# Use each word's length as its feature value
featuresets = [(word_features(n), word) for (n, word) in labeled_words]

# Split into training and test sets (90% / 10%)
cutoff = int(.9 * len(featuresets))
train_set, test_set = featuresets[:cutoff], featuresets[cutoff:]

# Train a naive Bayes classifier and check how it behaves
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("'behold' class", classifier.classify(word_features('behold')))
print("'the' class", classifier.classify(word_features('the')))

# Compute the classifier's accuracy on the test set
print("Accuracy", nltk.classify.accuracy(classifier, test_set))

# The most informative features; show_most_informative_features prints its table
# itself and returns None, hence the trailing None in the output below
print(classifier.show_most_informative_features(5))
Output:
[('i', True), ('is', True), ('in', True), ('he', True), ('ambitious', False)]
'behold' class False
'the' class True
Accuracy 0.8521671826625387
Most Informative Features
len = 7 False : True = 77.8 : 1.0
len = 6 False : True = 52.2 : 1.0
len = 1 True : False = 51.8 : 1.0
len = 2 True : False = 10.9 : 1.0
len = 5 False : True = 10.9 : 1.0
None
6 Sentiment analysis
Example code:
import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string

def getElementsByNum(values, num, keyOrValue):
    """
    Return the keys or values of the top num items of a frequency distribution.
    :param values: the frequency distribution (dict-like)
    :param num: number of items
    :param keyOrValue: 0 for keys, 1 for values
    """
    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    return tmpValue

# Load the data (the listing is cut off here in the source)
labeled_docs = [(
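Since the listing above is truncated, the following is only a minimal sketch of the same idea, not the original example: each movie review is labeled with its category ('pos' or 'neg'), the presence of a few hundred frequent non-stopword words is used as the feature set, and a naive Bayes classifier is trained and evaluated.

import random
import string
from nltk.corpus import movie_reviews, stopwords
from nltk import FreqDist, NaiveBayesClassifier
from nltk.classify import accuracy

sw = set(stopwords.words('english')) | set(string.punctuation)

# Each document is (list of words, label), where the label is the review category
labeled_docs = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]
random.seed(42)
random.shuffle(labeled_docs)

# Pick a few hundred frequent non-stopword words to serve as features
all_words = FreqDist(w.lower() for w in movie_reviews.words() if w.lower() not in sw)
word_features = [word for word, _ in all_words.most_common(500)]

def doc_features(doc):
    # Binary "does this review contain the word" features
    doc_words = set(w.lower() for w in doc)
    return {word: (word in doc_words) for word in word_features}

featuresets = [(doc_features(doc), label) for doc, label in labeled_docs]
cutoff = int(.9 * len(featuresets))
train_set, test_set = featuresets[:cutoff], featuresets[cutoff:]

classifier = NaiveBayesClassifier.train(train_set)
print("Accuracy", accuracy(classifier, test_set))
classifier.show_most_informative_features(5)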