Python Data Analysis Study Notes 9


Chapter 9: Analyzing Text Data and Social Media

1 Installing NLTK (omitted)
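Installation itself is skipped, but the examples below also need several NLTK data packages. A minimal setup sketch (package names as in current NLTK; older versions ship different tagger models):

# pip install nltk
import nltk

# Corpora and models used throughout this chapter
for pkg in ['stopwords', 'gutenberg',
            'averaged_perceptron_tagger',  # backs nltk.pos_tag
            'movie_reviews']:
    nltk.download(pkg)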


2 Filtering Out Stop Words, Names, and Numbers

Sample code:

import nltk

# Load the English stop-word corpus
sw = set(nltk.corpus.stopwords.words('english'))
print('Stop words', list(sw)[:7])

# List some of the files in the Gutenberg corpus
gb = nltk.corpus.gutenberg
print('Gutenberg files', gb.fileids()[-5:])

# Take the first two sentences of milton-paradise.txt as the text to filter
text_sent = gb.sents("milton-paradise.txt")[:2]
print('Unfiltered', text_sent)

# Filter out stop words
for sent in text_sent:
    filtered = [w for w in sent if w.lower() not in sw]
    print('Filtered', filtered)
    # POS-tag the remaining words
    tagged = nltk.pos_tag(filtered)
    print("Tagged", tagged)
    words = []
    for word in tagged:
        # Drop proper nouns (NNP) and cardinal numbers (CD)
        if word[1] != 'NNP' and word[1] != 'CD':
            words.append(word[0])
    print(words)

# POS tag set mapping
# print(nltk.tag.tagset_mapping('ru-rnc', 'universal'))

 

Output:

 


Stop words ['his', 'only', 'because', 'with', 'each', 'myself', 'both']

Gutenberg files ['milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Unfiltered [['[', 'Paradise', 'Lost', 'by', 'John', 'Milton', '1667', ']'], ['Book', 'I']]

Filtered ['[', 'Paradise', 'Lost', 'John', 'Milton', '1667', ']']

Tagged [('[', 'JJ'), ('Paradise', 'NNP'), ('Lost', 'NNP'), ('John', 'NNP'), ('Milton', 'NNP'), ('1667', 'CD'), (']', 'NN')]

['[', ']']

Filtered ['Book']

Tagged [('Book', 'NN')]

['Book']

 

The tag set used in this example:

 

{'PRP$', 'PDT', 'CD', 'EX', '.', 'NNS', 'MD', 'PRP', 'RP', '(', 'VBD', '``', "''",
 'NN',    # noun
 'LS', 'VBN', 'WRB',
 'IN',    # preposition
 'FW', 'POS',
 'CC',    # coordinating conjunction
 ':', 'DT', 'VBZ', 'RBS', 'RBR', 'WP$', 'RB', 'SYM', 'JJS', 'JJR', 'UH', 'WDT',
 '#', ',', ')', 'VB', 'NNPS',
 'VBP',   # verb
 'NNP',
 'JJ',    # adjective
 'WP', 'VBG', '$',
 'TO'}    # the word "to"


Roughly, these fall into the following 12 universal types (a quick demo follows the list):

'VERB',
'NOUN',
'PRON',
'ADJ',
'ADV',
'ADP',
'CONJ',
'DET',
'NUM',
'PRT',
'X',
'.'
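
The commented-out tagset_mapping call in the earlier example hints at this mapping. In current NLTK versions, pos_tag can also emit the coarse tags directly via its tagset argument; a small sketch (requires the universal_tagset resource):

import nltk
# nltk.download('universal_tagset')  # needed once for the coarse tag set

tokens = ['Paradise', 'Lost', 'by', 'John', 'Milton']
print(nltk.pos_tag(tokens))                      # fine-grained Penn Treebank tags
print(nltk.pos_tag(tokens, tagset='universal'))  # the 12 coarse universal tags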

 

 

 

3 The Bag-of-Words Model

Installing scikit-learn (omitted)

Sample code:

import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Load these two files from the Gutenberg corpus
gb = nltk.corpus.gutenberg
hamlet = gb.raw('shakespeare-hamlet.txt')
macbeth = gb.raw("shakespeare-macbeth.txt")

# Vectorize, dropping English stop words
cv = CountVectorizer(stop_words='english')

# Print the raw count matrix (one row per document)
print("Feature vector", cv.fit_transform([hamlet, macbeth]).toarray())

# Feature names are sorted alphabetically
# (in scikit-learn >= 1.0, use cv.get_feature_names_out() instead)
print('Features', cv.get_feature_names()[:5])

 

Output:

Feature vector [[ 1  0  1 ..., 14  0  1]
 [ 0  1  0 ...,  1  1  0]]

Features ['1599', '1603', 'abhominably', 'abhorred', 'abide']
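
The fitted vectorizer also exposes a vocabulary_ dict mapping each term to its column index, so individual counts can be read back out. A short sketch ('blood' is just an illustrative choice of word):

import nltk
from sklearn.feature_extraction.text import CountVectorizer

gb = nltk.corpus.gutenberg
hamlet = gb.raw('shakespeare-hamlet.txt')
macbeth = gb.raw('shakespeare-macbeth.txt')

cv = CountVectorizer(stop_words='english')
counts = cv.fit_transform([hamlet, macbeth]).toarray()

# vocabulary_ maps each feature term to its column in the count matrix
col = cv.vocabulary_['blood']
print("'blood' count - Hamlet:", counts[0, col], "Macbeth:", counts[1, col])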

 

4 Word Frequency Analysis

Sample code:

import nltk
import string

def printLine(values, num, keyOrValue, tag):
    """
    Print the keys or values of the num most frequent entries, labeled with tag.
    :param values: a frequency distribution
    :param num: how many entries to print
    :param keyOrValue: 0 prints the keys, 1 prints the values
    :param tag: output label
    """
    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    print(tag, ":", tmpValue)

# Load the document
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Remove stop words and punctuation
sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)
filtered = [w.lower() for w in words
            if w.lower() not in sw and w.lower() not in punctuation]

# Build a FreqDist and print the most frequent words and their counts
fd = nltk.FreqDist(filtered)
printLine(fd, 5, 0, "Words")
printLine(fd, 5, 1, "Counts")

# The single most frequent word and its count
print('Max', fd.max())
print('Count', fd['caesar'])

# The most frequent bigrams and their counts
fd = nltk.FreqDist(nltk.bigrams(filtered))
printLine(fd, 5, 0, "Bigrams")
printLine(fd, 5, 1, "Counts")
print('Bigram Max', fd.max())
print('Bigram count', fd[('let', 'vs')])

# The most frequent trigrams and their counts
fd = nltk.FreqDist(nltk.trigrams(filtered))
printLine(fd, 5, 0, "Trigrams")
printLine(fd, 5, 1, "Counts")
print('Trigram Max', fd.max())
print('Trigram count', fd[('enter', 'lucius', 'luc')])

 

Output:

Words : ['caesar', 'brutus', 'bru', 'haue', 'shall']

Counts : [190, 161, 153, 148, 125]

Max caesar

Count 190

 

Bigrams : [('let', 'vs'), ('wee', 'l'), ('mark', 'antony'), ('marke', 'antony'), ('st', 'thou')]

Counts : [16, 15, 13, 12, 12]

Bigram Max ('let', 'vs')

Bigram count 16

 

Trigrams : [('enter', 'lucius', 'luc'), ('wee', 'l', 'heare'), ('thee', 'thou', 'st'), ('beware', 'ides', 'march'), ('let', 'vs', 'heare')]

Counts : [4, 4, 3, 3, 3]

Trigram Max ('enter', 'lucius', 'luc')

Trigram count 4
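
Incidentally, FreqDist already provides most of what printLine does: most_common(n) returns the top (word, count) pairs in a single call. Reusing the filtered list from the code above:

fd = nltk.FreqDist(filtered)
print(fd.most_common(5))  # [('caesar', 190), ('brutus', 161), ('bru', 153), ...]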

 

 

5 Naive Bayes Classification

This is a probabilistic algorithm, based on Bayes' theorem from probability theory and mathematical statistics.
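
For reference, the classifier picks the label y with the highest posterior probability, under the "naive" assumption that the features f_i are conditionally independent given the label:

\hat{y} = \arg\max_y \; P(y) \prod_i P(f_i \mid y)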

Sample code:

import nltk
import string
import random

# Stop word and punctuation sets
sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)

# Use word length as the only feature
def word_features(word):
    return {'len': len(word)}

# Is the word a stop word or punctuation?
def isStopword(word):
    return word in sw or word in punctuation

# Load the file
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Label each word by whether it is a stop word
labeled_words = [(word.lower(), isStopword(word.lower())) for word in words]
random.seed(42)
random.shuffle(labeled_words)
print(labeled_words[:5])

# Use each word's length as its feature value
featuresets = [(word_features(n), word) for (n, word) in labeled_words]

# Split into training and test sets (90/10)
cutoff = int(.9 * len(featuresets))
train_set, test_set = featuresets[:cutoff], featuresets[cutoff:]

# Train a naive Bayes classifier and spot-check two words
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("'behold' class", classifier.classify(word_features('behold')))
print("'the' class", classifier.classify(word_features('the')))

# Compute the classifier's accuracy on the test set
print("Accuracy", nltk.classify.accuracy(classifier, test_set))

# The most informative features
print(classifier.show_most_informative_features(5))

 

Output:

[('i', True), ('is', True), ('in', True), ('he', True), ('ambitious', False)]

 

'behold' class False

'the' class True

 

Accuracy 0.8521671826625387

 

Most Informative Features

                     len = 7               False : True   =    77.8 : 1.0

                     len = 6               False : True   =    52.2 : 1.0

                     len = 1                True : False  =    51.8 : 1.0

                     len = 2                True : False  =    10.9 : 1.0

                     len = 5               False : True   =    10.9 : 1.0

None

 

6 Sentiment Analysis

Sample code:

import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string

def getElementsByNum(values, num, keyOrValue):
    """
    Return the keys or values of the num most frequent entries.
    :param values: a frequency distribution
    :param num: how many entries to return
    :param keyOrValue: 0 returns the keys, 1 returns the values
    """
    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    return tmpValue

# Load the movie reviews, labeled by category (pos/neg)
labeled_docs = [(list(movie_reviews.words(fid)), cat)
                for cat in movie_reviews.categories()
                for fid in movie_reviews.fileids(cat)]
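
The source page breaks off at this point. A plausible continuation of the script, following the standard NLTK movie_reviews sentiment workflow (the 5% feature cutoff and the doc_features helper below are assumptions, not recovered from the original):

random.seed(42)
random.shuffle(labeled_docs)

sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def isStopWord(word):
    return word in sw or word in punctuation

# Keep the most frequent ~5% of filtered review words as feature words
filtered = [w.lower() for w in movie_reviews.words() if not isStopWord(w.lower())]
words = FreqDist(filtered)
N = int(.05 * len(words))
word_features = getElementsByNum(words, N, 0)  # the helper defined above

def doc_features(doc):
    # Feature vector: how often each feature word occurs in this document
    doc_words = FreqDist(w.lower() for w in doc if not isStopWord(w.lower()))
    return {'count (%s)' % w: doc_words.get(w, 0) for w in word_features}

# Train on 90% of the shuffled documents, test on the rest
featuresets = [(doc_features(d), c) for (d, c) in labeled_docs]
cutoff = int(.9 * len(featuresets))
train_set, test_set = featuresets[:cutoff], featuresets[cutoff:]

classifier = NaiveBayesClassifier.train(train_set)
print("Accuracy", accuracy(classifier, test_set))
print(classifier.show_most_informative_features())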
