forked from norvig/pytudes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
old_spell.py
127 lines (108 loc) · 5.1 KB
/
old_spell.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html
Copyright (c) 2007-2016 Peter Norvig
MIT license: www.opensource.org/licenses/mit-license.php
"""
################ Spelling Corrector
import re
from collections import Counter
def words(text): return re.findall(r'\w+', text.lower())
WORDS = Counter(words(open('big.txt').read()))
def P(word, N=sum(WORDS.values())):
"Probability of `word`."
return float(WORDS[word]) / N
def correction(word):
"Most probable spelling correction for word."
return max(candidates(word), key=P)
def candidates(word):
"Generate possible spelling corrections for word."
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
def known(words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in WORDS)
def edits1(word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
def edits2(word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
################ Test Code
def unit_tests():
assert correction('speling') == 'spelling' # insert
assert correction('korrectud') == 'corrected' # replace 2
assert correction('bycycle') == 'bicycle' # replace
assert correction('inconvient') == 'inconvenient' # insert 2
#assert correction('arrainged') == 'arranged' # delete
assert correction('peotry') =='poetry' # transpose
assert correction('peotryy') =='poetry' # transpose + delete
assert correction('word') == 'word' # known
assert correction('quintessential') == 'quintessential' # unknown
assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
assert Counter(words('This is a test. 123; A TEST this is.')) == (
Counter({'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2}))
assert len(WORDS) == 32198
assert sum(WORDS.values()) == 1115585
assert WORDS.most_common(10) == [
('the', 79809),
('of', 40024),
('and', 38312),
('to', 28765),
('in', 22023),
('a', 21124),
('that', 12512),
('he', 12401),
('was', 11410),
('it', 10681)]
assert WORDS['the'] == 79809
assert P('quintessential') == 0
assert 0.07 < P('the') < 0.08
return 'unit_tests pass'
def spelltest(tests, verbose=False, log=False):
"Run correction(wrong) on all (right, wrong) pairs; report results."
import time
if log:
import datetime
filename = 'spell_log_' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
f = open(filename,'w')
start = time.clock()
good, unknown = 0, 0
n = len(tests)
for right, wrong in tests:
w = correction(wrong)
good += (w == right)
if w != right:
unknown += (right not in WORDS)
if verbose:
print('correction({}) => {} ({}); expected {} ({})'
.format(wrong, w, WORDS[w], right, WORDS[right]))
if log:
print >>f, 'correction({}) => {} ({}); expected {} ({})'.format(wrong, w, WORDS[w], right, WORDS[right])
if log:
f.close()
dt = time.clock() - start
print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
.format(float(good) / n, n, float(unknown) / n, float(n) / dt))
def Testset(lines):
"Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ('right', 'wrong2')] pairs."
return [(right, wrong)
for (right, wrongs) in (line.split(':') for line in lines)
for wrong in wrongs.split()]
if __name__ == '__main__':
print(unit_tests())
spelltest(Testset(open('spell-testset1.txt')), False, True)
spelltest(Testset(open('spell-testset2.txt')), False, True)
spelltest(Testset(open('spell-testset-complete.txt')), False, True)
#### https://www.dcs.bbk.ac.uk/~ROGER/corpora.html
#### wget https://www.dcs.bbk.ac.uk/~ROGER/missp.dat
#### $ cat missp.dat | tr '\n' ' ' | tr '$' '\n' | awk '{$2=":"OFS$2}1' | sed 's/ : /: /g' > spell-testset-complete.txt
#### wget https://www.dcs.bbk.ac.uk/~ROGER/missp.dat -O birkbeck.dat
#### cat birkbeck.dat | tr '\n' ' ' | tr '$' '\n' | awk '{$2=":"OFS$2}1' | sed 's/ : /: /g' > spell-testset-birkbeck.txt
#### wget https://www.dcs.bbk.ac.uk/~ROGER/aspell.dat
#### cat aspell.dat | tr '\n' ' ' | tr '$' '\n' | awk '{$2=":"OFS$2}1' | sed 's/ : /: /g' > spell-testset-aspell.txt
#### wget https://www.dcs.bbk.ac.uk/~ROGER/wikipedia.dat
#### cat wikipedia.dat | tr '\n' ' ' | tr '$' '\n' | awk '{$2=":"OFS$2}1' | sed 's/ : /: /g' > spell-testset-wikipedia.txt