"""
Code to accompany the chapter "natural language corpus data"
From the book "beautiful data" (segaran and hammerbacher, 2009)
Http://oreilly.com/catalog/9780596157111/
Code copyright (c) 2008-2009 by Peter norvig
You are free to use this code under the MIT licencse:
Http://www.opensource.org/licenses/mit-license.php
"""
import glob
import heapq
import operator
import random
import re
import string

from collections import defaultdict
from functools import reduce
from math import log10
def memo(f):
    """Memoize function f: cache results keyed by the positional args.

    The cache dict is exposed as the attribute `fmemo.memo` so callers
    can inspect or clear it. Arguments must be hashable.
    """
    table = {}
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]
    fmemo.memo = table
    return fmemo
def test(verbose=None):
    """Run some tests, taken from the chapter.
    Since the hillclimbing algorithm is randomized, some tests may fail."""
    import doctest
    print('Running tests...')
    doctest.testfile('ngrams-test.txt', verbose=verbose)
############### Word segmentation (p. 223)
@memo
def segment(text):
    """Return a list of words that is the best segmentation of text.

    Tries every (first-word, remainder) split, recursing on the remainder
    (memoized, so the recursion is effectively dynamic programming), and
    keeps the candidate with the highest unigram probability Pwords.
    """
    if not text: return []
    candidates = ([first] + segment(rem) for first, rem in splits(text))
    return max(candidates, key=Pwords)
def splits(text, L=20):
    """Return a list of all (first, rem) pairs with 1 <= len(first) <= L.

    L caps the first-word length so segmentation stays tractable on
    long inputs.
    """
    return [(text[:i + 1], text[i + 1:])
            for i in range(min(len(text), L))]
def Pwords(words):
    """The Naive Bayes probability of a sequence of words.

    Treats words as independent: the product of the unigram model Pw
    over each word.
    """
    return product(Pw(w) for w in words)
#### Support functions (p. 224)
def product(nums):
    """Return the product of a sequence of numbers (1 for an empty sequence)."""
    # reduce comes from functools in Python 3.
    return reduce(operator.mul, nums, 1)
class Pdist(dict):
    """A probability distribution estimated from (key, count) pairs in a datafile.

    Calling the instance with a key returns count/N for known keys, and
    missingfn(key, N) for unknown ones (default: 1/N).
    """
    def __init__(self, data=(), N=None, missingfn=None):
        # data is an iterable of (key, count) pairs; counts for repeated
        # keys are accumulated. (Immutable default avoids the shared
        # mutable-default pitfall.)
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        # N: total token count; if not given, use the sum of the counts.
        self.N = float(N or sum(self.values()))
        self.missingfn = missingfn or (lambda k, N: 1. / N)
    def __call__(self, key):
        if key in self: return self[key] / self.N
        else: return self.missingfn(key, self.N)
def datafile(name, sep='\t'):
    """Yield [key, value] pairs from file `name`, one per line, split on sep.

    Note: the trailing newline is left on the value field; downstream
    int() conversion tolerates it.
    """
    with open(name) as fh:  # context manager replaces Py2 file()
        for line in fh:
            yield line.split(sep)
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.

    Penalizes length: each extra letter divides the estimate by 10,
    so long unknown strings are very unlikely to be accepted as words.
    """
    return 10. / (N * 10 ** len(key))
N = 1024908267229  ## Number of tokens in the corpus
# Unigram word model, backed by the Google n-gram counts file.
Pw = Pdist(datafile('count_1w.txt'), N, avoid_long_words)
#### Segment2: second version, with bigram counts, (p. 226-227)
def cPw(word, prev):
    """Conditional probability of word, given the previous word.

    Uses the bigram count when available, otherwise backs off to the
    unigram probability Pw(word).
    """
    try:
        return P2w[prev + ' ' + word] / float(Pw[prev])
    except KeyError:
        return Pw(word)
# Bigram model: keys are 'word1 word2' strings from the counts file.
P2w = Pdist(datafile('count_2w.txt'), N)
@memo
def segment2(text, prev='<S>'):
    """Return (log P(words), words), where words is the best segmentation.

    Bigram version of segment: scores each candidate first word by its
    conditional probability given the previous word. Log-probabilities
    are used to avoid underflow on long texts.
    """
    if not text: return 0.0, []
    candidates = [combine(log10(cPw(first, prev)), first, segment2(rem, first))
                  for first, rem in splits(text)]
    return max(candidates)
def combine(Pfirst, first, Prem_and_rem):
    """Combine first and rem results into one (log-probability, words) pair.

    Prem_and_rem is the (Prem, rem) pair returned by segment2; tuple
    parameters were removed in Python 3 (PEP 3113), so unpack explicitly.
    """
    Prem, rem = Prem_and_rem
    return Pfirst + Prem, [first] + rem
############### Secret codes (p. 228-230)
def encode(msg, key):
    """Encode a message with a substitution cipher.

    key is a 26-letter permutation of the alphabet; both cases are mapped
    (ul pairs UPPER+lower for the translation table). Py3: str.maketrans
    replaces string.maketrans.
    """
    return msg.translate(str.maketrans(ul(alphabet), ul(key)))
def ul(text):
    "Return the uppercase version of text concatenated with the lowercase version."
    return text.upper() + text.lower()
# All 26 lowercase letters (the source was missing the final 'z',
# which would silently break every cipher routine below).
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def shift(msg, n=13):
    """Encode a message with a shift (Caesar) cipher: rotate letters by n.

    Default n=13 gives ROT13, which is its own inverse.
    """
    return encode(msg, alphabet[n:] + alphabet[:n])
def logPwords(words):
    """The Naive Bayes log10-probability of a string or sequence of words.

    A string is first tokenized with allwords; sums per-word log
    probabilities (equivalent to the log of the product).
    """
    if isinstance(words, str): words = allwords(words)
    return sum(log10(Pw(w)) for w in words)
def allwords(text):
    """Return a list of alphabetic words in text, lowercased.

    The pattern must be lowercase '[a-z]+' because the text is lowercased
    first (the corrupted '[A-Z]+' would never match).
    """
    return re.findall('[a-z]+', text.lower())
def decode_shift(msg):
    """Find the best decoding of a message encoded with a shift cipher.

    Tries all 26 rotations and keeps the one whose words are most
    probable under the unigram model.
    """
    candidates = [shift(msg, n) for n in range(len(alphabet))]
    return max(candidates, key=logPwords)
def shift2(msg, n=13):
    "Encode with a shift (Caesar) cipher, yielding only letters [a-z]."
    return shift(just_letters(msg), n)
def just_letters(text):
    """Lowercase text and remove every character except [a-z].

    The character class must be lowercase since the text is lowercased
    first (the corrupted '[^ A-Z]' would delete everything).
    """
    return re.sub('[^a-z]', '', text.lower())
def decode_shift2(msg):
    """Decode a message encoded with a shift cipher, with no spaces.

    Segments each of the 26 possible rotations and keeps the highest
    log-probability segmentation, returning it as a spaced string.
    """
    candidates = [segment2(shift(msg, n)) for n in range(len(alphabet))]
    p, words = max(candidates)
    return ' '.join(words)
#### General substitution cipher (p. 231-233)
def logP3letters(text):
    "The log10-probability of text under a letter 3-gram (trigram) model."
    return sum(log10(P3l(g)) for g in ngrams(text, 3))
def ngrams(seq, n):
    """List all the overlapping length-n subsequences of seq.

    Returns [] when len(seq) < n.
    """
    return [seq[i:i + n] for i in range(1 + len(seq) - n)]
# Letter trigram and bigram models from counts files.
P3l = Pdist(datafile('count_3l.txt'))
P2l = Pdist(datafile('count_2l.txt'))  ## Needed later by neighboring_msgs
def hillclimb(x, f, neighbors, steps=10000):
    """Search for an x that maximizes f(x), considering neighbors(x).

    Greedy hill-climbing: whenever a neighbor improves the score, move
    there and restart the neighbor generator from the new point. Runs for
    at most `steps` neighbor evaluations and returns the best x found.
    """
    fx = f(x)
    neighborhood = iter(neighbors(x))
    for _ in range(steps):
        x2 = next(neighborhood)  # Py3: next() instead of .next()
        fx2 = f(x2)
        if fx2 > fx:
            x, fx = x2, fx2
            neighborhood = iter(neighbors(x))
    if debugging: print('hillclimb:', x, int(fx))
    return x
# Set True to get a trace line from hillclimb.
debugging = False
def decode_subst(msg, steps=4000, restarts=90):
    """Decode a substitution cipher with random-restart hillclimbing.

    Strips msg to a single lowercase letter string, then runs `restarts`
    independent hillclimbs from random keys, scoring decodings with the
    letter-trigram model; finally segments the best candidates into words.
    """
    msg = cat(allwords(msg))
    candidates = [hillclimb(encode(msg, key=cat(shuffled(alphabet))),
                            logP3letters, neighboring_msgs, steps)
                  for _ in range(restarts)]
    p, words = max(segment2(c) for c in candidates)
    return ' '.join(words)
def shuffled(seq):
    "Return a randomly shuffled copy of the input sequence, as a list."
    seq = list(seq)  # copy so the caller's sequence is untouched
    random.shuffle(seq)
    return seq
# Concatenate a sequence of strings.
cat = ''.join
def neighboring_msgs(msg):
    """Generate nearby decodings of msg, hopefully better ones.

    First proposes swaps that improve the 20 least-probable letter
    bigrams in msg; then falls back to random letter-pair swaps forever
    (the generator never terminates — hillclimb bounds the steps).
    """
    def swap(a, b):
        # Exchange letters a and b throughout msg.
        return msg.translate(str.maketrans(a + b, b + a))
    for bigram in heapq.nsmallest(20, set(ngrams(msg, 2)), key=P2l):
        b1, b2 = bigram
        for c in alphabet:
            if b1 == b2:
                if P2l(c + c) > P2l(bigram): yield swap(c, b1)
            else:
                if P2l(c + b2) > P2l(bigram): yield swap(c, b1)
                if P2l(b1 + c) > P2l(bigram): yield swap(c, b2)
    while True:
        yield swap(random.choice(alphabet), random.choice(alphabet))
############### Spelling correction (p. 236 -)
def corrections(text):
    """Spell-correct all words in text, preserving non-word characters.

    Note: '[a-zA-Z]+' — the corrupted '[A-Za-Z]' is an invalid regex
    range, and `text` belongs as re.sub's third argument.
    """
    return re.sub('[a-zA-Z]+', lambda m: correct(m.group(0)), text)
def correct(w):
    """Return the word that is the most likely spell correction of w.

    Scores each candidate by P(edit) * P(word); lambda tuple parameters
    were removed in Python 3, so the pair is indexed explicitly.
    """
    candidates = edits(w).items()
    c, edit = max(candidates, key=lambda c_e: Pedit(c_e[1]) * Pw(c_e[0]))
    return c
def Pedit(edit):
    """The probability of an edit; can be '' or 'a|b' or 'a|b+c|d'.

    '' means no edit (probability 1 - p_spell_error); otherwise the
    single-edit probabilities (P1edit) of the '+'-separated edits are
    multiplied together, scaled by the overall spelling-error rate.
    """
    if edit == '': return (1. - p_spell_error)
    return p_spell_error * product(P1edit(e) for e in edit.split('+'))

# Prior probability that any given word contains a spelling error.
p_spell_error = 1. / 20.
# Probabilities of single edits, e.g. 'a|b' = typed b when a was intended.
P1edit = Pdist(datafile('count_1edit.txt'))
def edits(word, d=2):
    """Return a dict of {correct_word: edit_string} pairs within d edits of word.

    Recursively walks the word left-to-right, keeping only prefixes that
    can extend to a known word (via the `prefixes` set) so the search
    space stays small. Edit strings are '+'-joined 'intended|typed' pairs.
    """
    results = {}
    def editsR(hd, tl, d, edits):
        # hd: the (possibly edited) prefix built so far; tl: the unread tail;
        # d: remaining edit budget; edits: list of edits applied so far.
        def ed(L, R): return edits + [R + '|' + L]
        C = hd + tl
        if C in Pw:
            e = '+'.join(edits)
            if C not in results: results[C] = e
            else: results[C] = max(results[C], e, key=Pedit)  # keep likeliest edit path
        if d <= 0: return
        extensions = [hd + c for c in alphabet if hd + c in prefixes]
        p = (hd[-1] if hd else '<')  ## previous character ('<' = word start)
        ## Insertion
        for h in extensions:
            editsR(h, tl, d - 1, ed(p + h[-1], p))
        if not tl: return
        ## Deletion
        editsR(hd, tl[1:], d - 1, ed(p, p + tl[0]))
        for h in extensions:
            if h[-1] == tl[0]:  ## Match: consume a character for free
                editsR(h, tl[1:], d, edits)
            else:  ## Replacement
                editsR(h, tl[1:], d - 1, ed(h[-1], tl[0]))
        ## Transposition of the next two (distinct) characters
        if len(tl) >= 2 and tl[0] != tl[1] and hd + tl[1] in prefixes:
            editsR(hd + tl[1], tl[0] + tl[2:], d - 1,
                   ed(tl[1] + tl[0], tl[0:2]))
    ## Body of edits:
    editsR('', word, d, [])
    return results
# All prefixes (including '' and the full word) of every known word,
# used by edits() to prune impossible extensions early.
prefixes = set(w[:i] for w in Pw for i in range(len(w) + 1))