Ngrams: Naive Bayes word segmentation in Python

Source: Internet
Author: User

"""
Code to accompany the chapter "natural language corpus data"
From the book "beautiful data" (segaran and hammerbacher, 2009)
Http://oreilly.com/catalog/9780596157111/

Code copyright (c) 2008-2009 by Peter norvig

You are free to use this code under the MIT licencse:
Http://www.opensource.org/licenses/mit-license.php
"""

import re, string, random, glob, operator, heapq
from collections import defaultdict
from math import log10

def memo(f):
    "Memoize function f."
    table = {}
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]
    fmemo.memo = table
    return fmemo
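# A minimal sketch of how memo is used (fib is a hypothetical example, not
# part of the original code): repeated calls are answered from the
# fmemo.memo cache instead of being recomputed.
#
#   @memo
#   def fib(n): return n if n < 2 else fib(n-1) + fib(n-2)
#
#   fib(80)    # fast, because each fib(k) is computed once and cached
#   fib.memo   # the underlying {args: result} table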

def test(verbose=None):
    """Run some tests, taken from the chapter.
    Since the hillclimbing algorithm is randomized, some tests may fail."""
    import doctest
    print 'Running tests...'
    doctest.testfile('ngrams-test.txt', verbose=verbose)

############### Word segmentation (p. 223)

@memo
def segment(text):
    "Return a list of words that is the best segmentation of text."
    if not text: return []
    candidates = ([first] + segment(rem) for first, rem in splits(text))
    return max(candidates, key=Pwords)

def splits(text, L=20):
    "Return a list of all possible (first, rem) pairs, len(first) <= L."
    return [(text[:i+1], text[i+1:])
            for i in range(min(len(text), L))]

def Pwords(words):
    "The Naive Bayes probability of a sequence of words."
    return product(Pw(w) for w in words)
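# A small, deterministic illustration of splits (it follows directly from
# the code above; limiting L keeps the candidate list short):
#
#   splits('text', L=2)  ==  [('t', 'ext'), ('te', 'xt')]
#
# segment scores every candidate segmentation with Pwords, the product of
# unigram probabilities, and keeps the highest-scoring one.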

#### Support functions (p. 224)

def product(nums):
    "Return the product of a sequence of numbers."
    return reduce(operator.mul, nums, 1)

class Pdist(dict):
    "A probability distribution estimated from counts in datafile."
    def __init__(self, data=[], N=None, missingfn=None):
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        self.N = float(N or sum(self.itervalues()))
        self.missingfn = missingfn or (lambda k, N: 1./N)
    def __call__(self, key):
        if key in self: return self[key]/self.N
        else: return self.missingfn(key, self.N)

def datafile(name, sep='\t'):
    "Read key,value pairs from file."
    for line in file(name):
        yield line.split(sep)

def avoid_long_words(key, N):
    "Estimate the probability of an unknown word."
    return 10./(N * 10**len(key))

N = 1024908267229 ## Number of tokens

Pw = Pdist(datafile('count_1w.txt'), N, avoid_long_words)
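# A hedged usage sketch: segment needs count_1w.txt (the unigram count file
# that accompanies the chapter) next to this script. With that file in place,
# the chapter's example looks like:
#
#   >>> segment('choosespain')
#   ['choose', 'spain']
#
# Exact outputs depend on the counts, so treat this as illustrative.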

#### Segment2: second version, with bigram counts, (p. 226-227)

def cPw(word, prev):
    "Conditional probability of word, given previous word."
    try:
        return P2w[prev + ' ' + word]/float(Pw[prev])
    except KeyError:
        return Pw(word)

P2w = Pdist(datafile('count_2w.txt'), N)

@memo
def segment2(text, prev='<S>'):
    "Return (log P(words), words), where words is the best segmentation."
    if not text: return 0.0, []
    candidates = [combine(log10(cPw(first, prev)), first, segment2(rem, first))
                  for first, rem in splits(text)]
    return max(candidates)

def combine(Pfirst, first, (Prem, rem)):
    "Combine first and rem results into one (probability, words) pair."
    return Pfirst + Prem, [first] + rem
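# A hedged usage sketch for the bigram model (requires count_2w.txt as well):
# segment2 returns a (log10 probability, word list) pair rather than just a
# word list, which is what lets combine() add log-probabilities recursively.
#
#   logp, words = segment2('thisisatest')
#   # words should come back as ['this', 'is', 'a', 'test']; the exact
#   # log-probability depends on the bigram counts in count_2w.txt.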

############### Secret codes (p. 228-230)

def encode(msg, key):
    "Encode a message with a substitution cipher."
    return msg.translate(string.maketrans(ul(alphabet), ul(key)))

def ul(text): return text.upper() + text.lower()

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def shift(msg, n=13):
    "Encode a message with a shift (Caesar) cipher."
    return encode(msg, alphabet[n:] + alphabet[:n])

def logPwords(words):
    "The Naive Bayes probability of a string or sequence of words."
    if isinstance(words, str): words = allwords(words)
    return sum(log10(Pw(w)) for w in words)

def allwords(text):
    "Return a list of alphabetic words in text, lowercase."
    return re.findall('[a-z]+', text.lower())

def decode_shift(msg):
    "Find the best decoding of a message encoded with a shift cipher."
    candidates = [shift(msg, n) for n in range(len(alphabet))]
    return max(candidates, key=logPwords)
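# shift with the default n=13 is ROT13, so its output is deterministic:
#
#   >>> shift('hello world')
#   'uryyb jbeyq'
#
# decode_shift then tries all 26 rotations and keeps the one whose words
# score highest under logPwords; for the string above it should recover
# 'hello world', although the result ultimately depends on count_1w.txt.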

def shift2(msg, n=13):
    "Encode with a shift (Caesar) cipher, yielding only letters [a-z]."
    return shift(just_letters(msg), n)

def just_letters(text):
    "Lowercase text and remove all characters except [a-z]."
    return re.sub('[^a-z]', '', text.lower())

def decode_shift2(msg):
    "Decode a message encoded with a shift cipher, with no spaces."
    candidates = [segment2(shift(msg, n)) for n in range(len(alphabet))]
    p, words = max(candidates)
    return ' '.join(words)

#### General substitution cipher (p. 231-233)

def logP3letters(text):
    "The log-probability of text using a letter 3-gram model."
    return sum(log10(P3l(g)) for g in ngrams(text, 3))

def ngrams(seq, n):
    "List all the (overlapping) ngrams in a sequence."
    return [seq[i:i+n] for i in range(1+len(seq)-n)]

P3l = Pdist(datafile('count_3l.txt'))
P2l = Pdist(datafile('count_2l.txt')) ## We'll need it later
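# ngrams is deterministic, so a small example follows directly from the code:
#
#   >>> ngrams('wheels', 3)
#   ['whe', 'hee', 'eel', 'els']
#
# logP3letters scores a candidate decryption by summing the log-probabilities
# of its letter trigrams; it needs no word boundaries, which is what the
# hill-climbing search below relies on.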

def hillclimb(x, f, neighbors, steps=10000):
    "Search for an x that maximizes f(x), considering neighbors(x)."
    fx = f(x)
    neighborhood = iter(neighbors(x))
    for i in range(steps):
        x2 = neighborhood.next()
        fx2 = f(x2)
        if fx2 > fx:
            x, fx = x2, fx2
            neighborhood = iter(neighbors(x))
    if debugging: print 'hillclimb:', x, int(fx)
    return x

debugging = False

def decode_subst(msg, steps=4000, restarts=90):
    "Decode a substitution cipher with random restart hillclimbing."
    msg = cat(allwords(msg))
    candidates = [hillclimb(encode(msg, key=cat(shuffled(alphabet))),
                            logP3letters, neighboring_msgs, steps)
                  for _ in range(restarts)]
    p, words = max(segment2(c) for c in candidates)
    return ' '.join(words)

def shuffled(seq):
    "Return a randomly shuffled copy of the input sequence."
    seq = list(seq)
    random.shuffle(seq)
    return seq

cat = ''.join

def neighboring_msgs(msg):
    "Generate nearby keys, hopefully better ones."
    def swap(a, b): return msg.translate(string.maketrans(a+b, b+a))
    for bigram in heapq.nsmallest(20, set(ngrams(msg, 2)), P2l):
        b1, b2 = bigram
        for c in alphabet:
            if b1 == b2:
                if P2l(c+c) > P2l(bigram): yield swap(c, b1)
            else:
                if P2l(c+b2) > P2l(bigram): yield swap(c, b1)
                if P2l(b1+c) > P2l(bigram): yield swap(c, b2)
    while True:
        yield swap(random.choice(alphabet), random.choice(alphabet))
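# A hedged usage sketch: decode_subst strips the message down to letters,
# runs `restarts` independent hill-climbing searches from random substitution
# keys, scores candidates with the letter trigram model, and finally segments
# the best one into words. The sample sentence below is arbitrary, and the
# search is randomized, so results can vary; longer messages decode better.
#
#   ciphertext = encode('a message in english text', cat(shuffled(alphabet)))
#   print decode_subst(ciphertext)   # may recover the plaintext words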

############### Spelling correction (p. 236 -)

def corrections(text):
    "Spell-correct all words in text."
    return re.sub('[a-zA-Z]+', lambda m: correct(m.group(0)), text)

def correct(w):
    "Return the word that is the most likely spell correction of w."
    candidates = edits(w).items()
    c, edit = max(candidates, key=lambda (c, e): Pedit(e) * Pw(c))
    return c

def Pedit(edit):
    "The probability of an edit; can be '' or 'a|b' or 'a|b+c|d'."
    if edit == '': return (1. - p_spell_error)
    return p_spell_error * product(P1edit(e) for e in edit.split('+'))

p_spell_error = 1./20.

P1edit = Pdist(datafile('count_1edit.txt')) ## Probabilities of single edits
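# A worked example, following directly from the definitions above: with
# p_spell_error = 1/20, Pedit('') = 1 - 1/20 = 0.95 (no edit), while a
# two-edit string such as 'a|b+c|d' is scored as
# 0.05 * P1edit('a|b') * P1edit('c|d'), so each extra edit multiplies in
# another single-edit probability from count_1edit.txt.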

def edits(word, d=2):
    "Return a dict of {correct: edit} pairs within d edits of word."
    results = {}
    def editsR(hd, tl, d, edits):
        def ed(L, R): return edits + [R + '|' + L]
        C = hd + tl
        if C in Pw:
            e = '+'.join(edits)
            if C not in results: results[C] = e
            else: results[C] = max(results[C], e, key=Pedit)
        if d <= 0: return
        extensions = [hd+c for c in alphabet if hd+c in PREFIXES]
        p = (hd[-1] if hd else '<') ## Previous character
        ## Insertion
        for h in extensions:
            editsR(h, tl, d-1, ed(p+h[-1], p))
        if not tl: return
        ## Deletion
        editsR(hd, tl[1:], d-1, ed(p, p+tl[0]))
        for h in extensions:
            if h[-1] == tl[0]: ## Match
                editsR(h, tl[1:], d, edits)
            else: ## Replacement
                editsR(h, tl[1:], d-1, ed(h[-1], tl[0]))
        ## Transpose
        if len(tl) >= 2 and tl[0] != tl[1] and hd+tl[1] in PREFIXES:
            editsR(hd+tl[1], tl[0]+tl[2:], d-1,
                   ed(tl[1]+tl[0], tl[0:2]))
    ## Body of edits:
    editsR('', word, d, [])
    return results

PREFIXES = set(w[:i] for w in Pw for i in range(len(w) + 1))
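# A hedged usage sketch: with count_1w.txt and count_1edit.txt loaded, the
# corrector scores each candidate c by Pedit(edit) * Pw(c) and keeps the
# best. Outputs depend on those count files, so treat these as illustrative:
#
#   correct('speling')            # likely 'spelling'
#   corrections('hte problme')    # likely 'the problem'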
