NLP-python: natural language processing 01
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 6 22:21:09 2017

@author: Administrator

Basic text exploration with the sample texts bundled in ``nltk.book``:
concordance and context search, vocabulary statistics, frequency
distributions, bigrams and collocations.

The two helper functions and ``sent1`` are importable without NLTK; the
corpus walkthrough itself only runs when the file is executed as a script.
"""


def lexical_diversity(text):
    """Return the average number of occurrences per distinct token.

    Computed as ``len(text) / len(set(text))``: the total token count
    divided by the number of distinct tokens.

    Args:
        text: A sized iterable of hashable tokens (e.g. a list of words).

    Returns:
        The ratio of total tokens to distinct tokens.

    Raises:
        ZeroDivisionError: If *text* is empty.
    """
    return len(text) / len(set(text))


def percentage(count, total):
    """Return *count* expressed as a percentage of *total*.

    Args:
        count: The part.
        total: The whole.

    Returns:
        ``100 * count / total``.

    Raises:
        ZeroDivisionError: If *total* is zero.
    """
    return 100 * count / total


# A sentence represented as a list of tokens.
sent1 = ['call', 'me', 'ishmael', '.']


def main():
    """Run the corpus walkthrough, printing each result.

    Requires NLTK with the ``nltk.book`` sample data installed; the two
    plotting calls additionally need matplotlib.
    """
    # Imported locally so that importing this module for its helpers does
    # not require NLTK or trigger loading of the sample texts.
    import nltk
    from nltk.book import text1, text2, text3, text4, text5
    from nltk.util import bigrams

    # Search for a keyword: show every occurrence with its context.
    text1.concordance('monstrous')

    # Search for words that appear in similar contexts.
    text1.similar('monstrous')

    # Search for contexts shared by both words.
    text2.common_contexts(['monstrous', 'very'])

    # Vocabulary distribution: where the words occur across the text.
    text4.dispersion_plot(['monstrous', 'very'])

    # Text length in tokens.
    print(len(text3))

    # Repeated-word density: tokens per distinct token.
    print(lexical_diversity(text3))

    # Keyword density: raw count, then share of the whole text.
    print(text3.count('smote'))
    print(percentage(text4.count('A'), len(text4)))

    # Retrieve a token by position and a position by token.
    # Note that indices start at 0.
    print(text3[172])
    print(text3.index('love'))

    # Frequency distribution: simple statistics to find commonly used words.
    fdist1 = nltk.FreqDist(text1)
    vocabulary1 = list(fdist1.keys())
    print(vocabulary1[:50])
    print(fdist1['whale'])
    fdist1.plot(50, cumulative=True)

    # Low-frequency words: tokens that occur exactly once.
    # Only a sample is printed because the full list is long.
    hapaxes = fdist1.hapaxes()
    print(hapaxes[:20])

    # Fine-grained word selection: distinct words longer than 15 characters.
    vocabulary = set(text1)
    long_words = [w for w in vocabulary if len(w) > 15]
    print(sorted(long_words))

    # Select by both length and frequency: long words that are also common.
    fdist5 = nltk.FreqDist(text5)
    print(sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7))

    # Word pairs (bigrams) of a token list.
    print(list(bigrams(['more', 'is', 'said', 'thone', 'done'])))

    # Frequently co-occurring word pairs in the text.
    text4.collocations()

    # Length of each word in the text (sample only; one entry per token).
    word_lengths = [len(w) for w in text1]
    print(word_lengths[:20])

    # Distribution of word lengths; maps each length to its count.
    fdist = nltk.FreqDist(word_lengths)
    print(list(fdist.keys()))   # the distinct word lengths
    print(list(fdist.items()))  # (length, count) pairs
    print(fdist.max())          # the most frequent word length
    print(fdist[3])             # how many words have length 3


if __name__ == "__main__":
    main()