# Python 3.x: basic knowledge of natural language processing
# Last update: 2018-07-24
# Source: Internet
# Author: User
import nltk

# Download the NLTK corpora. If NLTK itself is not installed yet, install it
# first by running `pip install nltk` from the command prompt.
nltk.download()

# The star import is intentional: it brings the tutorial's sample texts
# (text1 ... text9) and sentences (sent1 ... sent9) into the namespace.
from nltk.book import *
### Searching text
# Search for a word and show it in context
text1.concordance("monstrous")
text2.concordance("affection")
text3.concordance("lived")
text5.concordance("lol")
# Search for similar words
text1.similar("monstrous")
text2.similar("monstrous")
# Search for contexts shared by several words
text2.common_contexts(["monstrous", "very"])
# Lexical dispersion plot
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
### Counting vocabulary
# NOTE: bare expressions below display their value only in an interactive
# session; wrap them in print() when running this file as a script.
len(text3)
sorted(set(text3))
len(set(text3))
# Repeated-word density (average number of uses per distinct token)
# `from __future__ import division` is not needed: on Python 3 the `/`
# operator is already true division, and a __future__ import placed anywhere
# but the top of the file is a SyntaxError.
len(text3) / len(set(text3))
# Keyword density
text3.count("smote")
text4.count('a') / len(text4)
def lexical_diversity(text):
    """Return the average number of times each distinct token is used.

    Args:
        text: A sized iterable of hashable tokens (e.g. a list of words).

    Returns:
        ``len(text)`` divided by the number of distinct tokens, as a float.

    Raises:
        ZeroDivisionError: If ``text`` is empty.
    """
    return len(text) / len(set(text))
def percentage(count, total):
    """Return ``count`` as a percentage of ``total``.

    Args:
        count: The partial amount.
        total: The whole amount.

    Returns:
        ``100 * count / total``.

    Raises:
        ZeroDivisionError: If ``total`` is zero.
    """
    return 100 * count / total
# Try the two helpers on the sample texts (values display only in an
# interactive session).
lexical_diversity(text3)
lexical_diversity(text5)
percentage(4, 5)
percentage(text4.count('a'), len(text4))
### Word lists
# NOTE: this rebinds the sent1 name that `from nltk.book import *` provides.
sent1 = ['Call', 'me', 'Ishmael', '.']
sent1
len(sent1)
lexical_diversity(sent1)
print(sent2)
print(sent3)
# Concatenation
sent4 + sent1
# Appending
sent1.append("some")
print(sent1)
# Indexing
text4[173]
text4.index('awaken')
# Slicing
print(text5[16715:16735])
print(text6[1600:1625])
# Be aware: indexes start at 0
sent = ['word1', 'word2', 'word3', 'word4', 'word5',
        'word6', 'word7', 'word8', 'word9', 'word10']
print(sent[0])
print(sent[9])
# Index 10 is one past the end of a 10-element list. The failure is the
# point of the example, so catch it instead of aborting the script.
try:
    print(sent[10])
except IndexError as err:
    print("IndexError:", err)
print(sent[5:8])
print(sent[5])
print(sent[6])
print(sent[7])
print(sent[:3])
print(text2[141525:])
sent[0] = 'a'
sent[9] = 'last'
# Replacing the eight elements at indexes 1..8 with two shrinks the list to
# four elements.
sent[1:9] = ['Second', 'third']
print(sent)
# Index 9 no longer exists after the slice assignment above.
try:
    sent[9]
except IndexError as err:
    print("IndexError:", err)
### Simple statistics
# Frequency distribution
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = list(fdist1.keys())
vocabulary1[:50]
fdist1['whale']
# Cumulative frequency plot of the 50 most frequent tokens
fdist1.plot(50, cumulative=True)
fdist1.hapaxes()  # Find the words that occur only once
# Fine-grained selection of words
V = set(text4)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
V = set(text5)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
fdist5 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7)
# Collocations
from nltk.util import bigrams
list(bigrams(['more', 'are', 'said', 'than', 'done']))
text4.collocations()
text8.collocations()
### Other statistics
# Distribution of word lengths in text1 (values display only in an
# interactive session).
[len(w) for w in text1]
fdist = FreqDist([len(w) for w in text1])
fdist
fdist.keys()
fdist.items()
fdist.max()
fdist[3]
fdist.freq(3)