"""
Code to accompany the chapter "natural language corpus data"
From the book "beautiful data" (segaran and hammerbacher, 2009)
Http://oreilly.com/catalog/9780596157111/
Code copyright (c) 2008-2009 by Peter norvig
You are free to use this code under the MIT licencse:
Http://www.opensource.org/licenses/mit-license.php
"""
import glob
import heapq
import operator
import random
import re
import string

from collections import defaultdict
from functools import reduce
from math import log10
def memo(f):
    """Memoize function f: cache results keyed by the positional args.

    The cache dict is exposed as the attribute `fmemo.memo` so callers
    can inspect or clear it. Arguments must be hashable.
    """
    table = {}
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]
    fmemo.memo = table
    return fmemo
def test(verbose=None):
    """Run some tests, taken from the chapter.
    Since the hillclimbing algorithm is randomized, some tests may fail."""
    import doctest
    print('Running tests...')
    doctest.testfile('ngrams-test.txt', verbose=verbose)
############### Word segmentation (p. 223)
@memo
def segment(text):
    """Return a list of words that is the best segmentation of text.

    Tries every (first-word, remainder) split, recursing on the remainder
    (memoized, so the recursion is effectively dynamic programming), and
    keeps the candidate with the highest unigram probability Pwords.
    """
    if not text: return []
    candidates = ([first] + segment(rem) for first, rem in splits(text))
    return max(candidates, key=Pwords)
def splits(text, L=20):
    """Return a list of all (first, rem) pairs with 1 <= len(first) <= L.

    L caps the first-word length so segmentation stays tractable on
    long inputs.
    """
    return [(text[:i + 1], text[i + 1:])
            for i in range(min(len(text), L))]
def Pwords(words):
    """The Naive Bayes probability of a sequence of words.

    Treats words as independent: the product of the unigram model Pw
    over each word.
    """
    return product(Pw(w) for w in words)
#### Support functions (p. 224)
def product(nums):
    """Return the product of a sequence of numbers (1 for an empty sequence)."""
    # reduce comes from functools in Python 3.
    return reduce(operator.mul, nums, 1)
class Pdist(dict):
    """A probability distribution estimated from (key, count) pairs in a datafile.

    Calling the instance with a key returns count/N for known keys, and
    missingfn(key, N) for unknown ones (default: 1/N).
    """
    def __init__(self, data=(), N=None, missingfn=None):
        # data is an iterable of (key, count) pairs; counts for repeated
        # keys are accumulated. (Immutable default avoids the shared
        # mutable-default pitfall.)
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        # N: total token count; if not given, use the sum of the counts.
        self.N = float(N or sum(self.values()))
        self.missingfn = missingfn or (lambda k, N: 1. / N)
    def __call__(self, key):
        if key in self: return self[key] / self.N
        else: return self.missingfn(key, self.N)
def datafile(name, sep='\t'):
    """Yield [key, value] pairs from file `name`, one per line, split on sep.

    Note: the trailing newline is left on the value field; downstream
    int() conversion tolerates it.
    """
    with open(name) as fh:  # context manager replaces Py2 file()
        for line in fh:
            yield line.split(sep)
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.

    Penalizes length: each extra letter divides the estimate by 10,
    so long unknown strings are very unlikely to be accepted as words.
    """
    return 10. / (N * 10 ** len(key))
N = 1024908267229  ## Number of tokens in the corpus
# Unigram word model, backed by the Google n-gram counts file.
Pw = Pdist(datafile('count_1w.txt'), N, avoid_long_words)
#### Segment2: second version, with bigram counts, (p. 226-227)
def cPw(word, prev):
    """Conditional probability of word, given the previous word.

    Uses the bigram count when available, otherwise backs off to the
    unigram probability Pw(word).
    """
    try:
        return P2w[prev + ' ' + word] / float(Pw[prev])
    except KeyError:
        return Pw(word)
# Bigram model: keys are 'word1 word2' strings from the counts file.
P2w = Pdist(datafile('count_2w.txt'), N)
@memo
def segment2(text, prev='<S>'):
    """Return (log P(words), words), where words is the best segmentation.

    Bigram version of segment: scores each candidate first word by its
    conditional probability given the previous word. Log-probabilities
    are used to avoid underflow on long texts.
    """
    if not text: return 0.0, []
    candidates = [combine(log10(cPw(first, prev)), first, segment2(rem, first))
                  for first, rem in splits(text)]
    return max(candidates)
def combine(Pfirst, first, Prem_and_rem):
    """Combine first and rem results into one (log-probability, words) pair.

    Prem_and_rem is the (Prem, rem) pair returned by segment2; tuple
    parameters were removed in Python 3 (PEP 3113), so unpack explicitly.
    """
    Prem, rem = Prem_and_rem
    return Pfirst + Prem, [first] + rem
############### Secret codes (p. 228-230)
def encode(msg, key):
    """Encode a message with a substitution cipher.

    key is a 26-letter permutation of the alphabet; both cases are mapped
    (ul pairs UPPER+lower for the translation table). Py3: str.maketrans
    replaces string.maketrans.
    """
    return msg.translate(str.maketrans(ul(alphabet), ul(key)))
def ul(text):
    "Return the uppercase version of text concatenated with the lowercase version."
    return text.upper() + text.lower()
# All 26 lowercase letters (the source was missing the final 'z',
# which would silently break every cipher routine below).
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def shift(msg, n=13):
    """Encode a message with a shift (Caesar) cipher: rotate letters by n.

    Default n=13 gives ROT13, which is its own inverse.
    """
    return encode(msg, alphabet[n:] + alphabet[:n])
def logPwords(words):
    """The Naive Bayes log10-probability of a string or sequence of words.

    A string is first tokenized with allwords; sums per-word log
    probabilities (equivalent to the log of the product).
    """
    if isinstance(words, str): words = allwords(words)
    return sum(log10(Pw(w)) for w in words)
def allwords(text):
    """Return a list of alphabetic words in text, lowercased.

    The pattern must be lowercase '[a-z]+' because the text is lowercased
    first (the corrupted '[A-Z]+' would never match).
    """
    return re.findall('[a-z]+', text.lower())
def decode_shift(msg):
    """Find the best decoding of a message encoded with a shift cipher.

    Tries all 26 rotations and keeps the one whose words are most
    probable under the unigram model.
    """
    candidates = [shift(msg, n) for n in range(len(alphabet))]
    return max(candidates, key=logPwords)
def shift2(msg, n=13):
    "Encode with a shift (Caesar) cipher, yielding only letters [a-z]."
    return shift(just_letters(msg), n)
def just_letters(text):
    """Lowercase text and remove every character except [a-z].

    The character class must be lowercase since the text is lowercased
    first (the corrupted '[^ A-Z]' would delete everything).
    """
    return re.sub('[^a-z]', '', text.lower())
def decode_shift2(msg):
    """Decode a message encoded with a shift cipher, with no spaces.

    Segments each of the 26 possible rotations and keeps the highest
    log-probability segmentation, returning it as a spaced string.
    """
    candidates = [segment2(shift(msg, n)) for n in range(len(alphabet))]
    p, words = max(candidates)
    return ' '.join(words)
#### General substitution cipher (p. 231-233)
def logP3letters(text):
    "The log10-probability of text under a letter 3-gram (trigram) model."
    return sum(log10(P3l(g)) for g in ngrams(text, 3))
def ngrams(seq, n):
    """List all the overlapping length-n subsequences of seq.

    Returns [] when len(seq) < n.
    """
    return [seq[i:i + n] for i in range(1 + len(seq) - n)]
# Letter trigram and bigram models from counts files.
P3l = Pdist(datafile('count_3l.txt'))
P2l = Pdist(datafile('count_2l.txt'))  ## Needed later by neighboring_msgs
def hillclimb(x, f, neighbors, steps=10000):
    """Search for an x that maximizes f(x), considering neighbors(x).

    Greedy hill-climbing: whenever a neighbor improves the score, move
    there and restart the neighbor generator from the new point. Runs for
    at most `steps` neighbor evaluations and returns the best x found.
    """
    fx = f(x)
    neighborhood = iter(neighbors(x))
    for _ in range(steps):
        x2 = next(neighborhood)  # Py3: next() instead of .next()
        fx2 = f(x2)
        if fx2 > fx:
            x, fx = x2, fx2
            neighborhood = iter(neighbors(x))
    if debugging: print('hillclimb:', x, int(fx))
    return x
# Set True to get a trace line from hillclimb.
debugging = False
def decode_subst(msg, steps=4000, restarts=90):
    """Decode a substitution cipher with random-restart hillclimbing.

    Strips msg to a single lowercase letter string, then runs `restarts`
    independent hillclimbs from random keys, scoring decodings with the
    letter-trigram model; finally segments the best candidates into words.
    """
    msg = cat(allwords(msg))
    candidates = [hillclimb(encode(msg, key=cat(shuffled(alphabet))),
                            logP3letters, neighboring_msgs, steps)
                  for _ in range(restarts)]
    p, words = max(segment2(c) for c in candidates)
    return ' '.join(words)
def shuffled(seq):
    "Return a randomly shuffled copy of the input sequence, as a list."
    seq = list(seq)  # copy so the caller's sequence is untouched
    random.shuffle(seq)
    return seq
# Concatenate a sequence of strings.
cat = ''.join
def neighboring_msgs(msg):
    """Generate nearby decodings of msg, hopefully better ones.

    First proposes swaps that improve the 20 least-probable letter
    bigrams in msg; then falls back to random letter-pair swaps forever
    (the generator never terminates — hillclimb bounds the steps).
    """
    def swap(a, b):
        # Exchange letters a and b throughout msg.
        return msg.translate(str.maketrans(a + b, b + a))
    for bigram in heapq.nsmallest(20, set(ngrams(msg, 2)), key=P2l):
        b1, b2 = bigram
        for c in alphabet:
            if b1 == b2:
                if P2l(c + c) > P2l(bigram): yield swap(c, b1)
            else:
                if P2l(c + b2) > P2l(bigram): yield swap(c, b1)
                if P2l(b1 + c) > P2l(bigram): yield swap(c, b2)
    while True:
        yield swap(random.choice(alphabet), random.choice(alphabet))
############### Spelling correction (p. 236 -)
def corrections(text):
    """Spell-correct all words in text, preserving non-word characters.

    Note: '[a-zA-Z]+' — the corrupted '[A-Za-Z]' is an invalid regex
    range, and `text` belongs as re.sub's third argument.
    """
    return re.sub('[a-zA-Z]+', lambda m: correct(m.group(0)), text)
def correct(w):
    """Return the word that is the most likely spell correction of w.

    Scores each candidate by P(edit) * P(word); lambda tuple parameters
    were removed in Python 3, so the pair is indexed explicitly.
    """
    candidates = edits(w).items()
    c, edit = max(candidates, key=lambda c_e: Pedit(c_e[1]) * Pw(c_e[0]))
    return c
def Pedit(edit):
    """The probability of an edit; can be '' or 'a|b' or 'a|b+c|d'.

    '' means no edit (probability 1 - p_spell_error); otherwise the
    single-edit probabilities (P1edit) of the '+'-separated edits are
    multiplied together, scaled by the overall spelling-error rate.
    """
    if edit == '': return (1. - p_spell_error)
    return p_spell_error * product(P1edit(e) for e in edit.split('+'))

# Prior probability that any given word contains a spelling error.
p_spell_error = 1. / 20.
# Probabilities of single edits, e.g. 'a|b' = typed b when a was intended.
P1edit = Pdist(datafile('count_1edit.txt'))
def edits(word, d=2):
    """Return a dict of {correct_word: edit_string} pairs within d edits of word.

    Recursively walks the word left-to-right, keeping only prefixes that
    can extend to a known word (via the `prefixes` set) so the search
    space stays small. Edit strings are '+'-joined 'intended|typed' pairs.
    """
    results = {}
    def editsR(hd, tl, d, edits):
        # hd: the (possibly edited) prefix built so far; tl: the unread tail;
        # d: remaining edit budget; edits: list of edits applied so far.
        def ed(L, R): return edits + [R + '|' + L]
        C = hd + tl
        if C in Pw:
            e = '+'.join(edits)
            if C not in results: results[C] = e
            else: results[C] = max(results[C], e, key=Pedit)  # keep likeliest edit path
        if d <= 0: return
        extensions = [hd + c for c in alphabet if hd + c in prefixes]
        p = (hd[-1] if hd else '<')  ## previous character ('<' = word start)
        ## Insertion
        for h in extensions:
            editsR(h, tl, d - 1, ed(p + h[-1], p))
        if not tl: return
        ## Deletion
        editsR(hd, tl[1:], d - 1, ed(p, p + tl[0]))
        for h in extensions:
            if h[-1] == tl[0]:  ## Match: consume a character for free
                editsR(h, tl[1:], d, edits)
            else:  ## Replacement
                editsR(h, tl[1:], d - 1, ed(h[-1], tl[0]))
        ## Transposition of the next two (distinct) characters
        if len(tl) >= 2 and tl[0] != tl[1] and hd + tl[1] in prefixes:
            editsR(hd + tl[1], tl[0] + tl[2:], d - 1,
                   ed(tl[1] + tl[0], tl[0:2]))
    ## Body of edits:
    editsR('', word, d, [])
    return results
# All prefixes (including '' and the full word) of every known word,
# used by edits() to prune impossible extensions early.
prefixes = set(w[:i] for w in Pw for i in range(len(w) + 1))