-
Notifications
You must be signed in to change notification settings - Fork 1
/
util.py
125 lines (102 loc) · 2.77 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# utility script
import string
import datetime, time
import pickle
# The 26 lowercase ASCII letters, one single-character string per entry;
# used as the alphabet for generated insertions and substitutions.
ALPHABETS = [*string.ascii_lowercase]
# NOTE(review): presumably the textual label of an empty state (an empty
# set/dict rendered as a string) -- confirm against the code that uses it.
EMPTY_STATE = "{}"
def read_dict(path):
    """Load the purely alphabetic words from a one-word-per-line file.

    Every line has its trailing whitespace removed and is lower-cased.
    Entries for which ``str.isalpha()`` is false (blank lines, words with
    digits, apostrophes, hyphens, inner spaces, ...) are dropped.

    Args:
        path: location of the word list file.
    Return:
        [word] in file order; duplicates are kept.
    """
    with open(path) as handle:
        normalized = (line.rstrip().lower() for line in handle)
        # An empty string is not alphabetic, so blank lines vanish here too.
        return [word for word in normalized if word.isalpha()]
def read_train_data(path="./data/training_set/edit1s.txt"):
    """Read (spelling, corrected) pairs from a tab-separated training file.

    Each non-blank line must hold exactly two tab-separated fields: the
    observed spelling and its correction.  The line terminator is removed
    before splitting so the correction does not carry a trailing newline.

    Args:
        path: file to read; defaults to the bundled edit1s training set.
    Return:
        [(spelling, corrected)] in file order.
    Raises:
        ValueError: a non-blank line does not have exactly two fields.
    """
    pairs = []
    with open(path) as f:
        for line_no, line in enumerate(f, start=1):
            # Drop only the line ending; other whitespace is part of the data.
            line = line.rstrip("\r\n")
            if not line:
                # Blank lines (e.g. a trailing one at EOF) carry no pair.
                continue
            fields = line.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    "{}:{}: expected 2 tab-separated fields, got {}".format(
                        path, line_no, len(fields)
                    )
                )
            pairs.append((fields[0], fields[1]))
    return pairs
def get_web_dictionary():
    """Read the word list stored at /usr/share/dict/web2.

    Return:
        [word] as produced by read_dict for that file.
    """
    web2_path = "/usr/share/dict/web2"
    return read_dict(web2_path)
def get_web_dictionary_set():
    """Collect the web dictionary words into a set.

    Return:
        {word} built from get_web_dictionary(), so duplicates collapse.
    """
    words = get_web_dictionary()
    return set(words)
def print_now():
    """Print the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    now = datetime.datetime.fromtimestamp(time.time())
    stamp = now.strftime('%Y-%m-%d %H:%M:%S')
    print(stamp)
def save_data(data, path):
    """Pickle an object into a file, overwriting any existing content.

    Args:
        data: object to serialize; it must be picklable.
        path: destination file, opened in binary write mode.
    Return:
        None
    """
    with open(path, "wb") as sink:
        pickle.Pickler(sink).dump(data)
def load_data(path):
    """Unpickle and return the object stored in a file.

    Args:
        path: file to read, opened in binary mode.
    Return:
        the object reconstructed by pickle from the file's content.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as source:
        return pickle.Unpickler(source).load()
def edits_1(word):
    """Generate every string one deletion, substitution or insertion away.

    Insertions and substitutions draw their letters from ALPHABETS.  Because
    a character may be substituted by itself, the result can contain `word`
    unchanged.  Transpositions are not generated.

    Args:
        word: the string to edit.
    Return:
        {word} of all candidates produced by a single edit.
    """
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insert a letter at this split point (valid even at the very end).
        for letter in ALPHABETS:
            candidates.add(head + letter + tail)
        if not tail:
            continue
        rest = tail[1:]
        # Delete the character right after the split point ...
        candidates.add(head + rest)
        # ... or replace it with each letter of the alphabet.
        for letter in ALPHABETS:
            candidates.add(head + letter + rest)
    return candidates
def edits_k(k, word):
    """Generate candidates within k edit-distance of word.

    Runs a breadth-first expansion with edits_1: each round expands only the
    words discovered in the previous round.  Words seen earlier were already
    expanded, so skipping them avoids redundant work without changing the
    resulting set.

    Args:
        k: maximum number of edits; k <= 0 yields just {word}.
        word: the starting string.
    Return:
        {word} containing `word` and everything reachable in at most k edits.
    Ref:
        https://norvig.com/spell-correct.html
    """
    seen = {word}
    frontier = {word}
    for _ in range(k):
        reached = set()
        for w in frontier:
            reached |= edits_1(w)
        # Keep only genuinely new words for the next round.
        frontier = reached - seen
        if not frontier:
            # Nothing new was found, so further rounds cannot add anything.
            break
        seen |= frontier
    return seen
def corpus2set(corpus_path):
    """Load a corpus word list and deduplicate it.

    Args:
        corpus_path: file handed to read_dict.
    Return:
        {word} of the words read_dict extracts from the file.
    """
    words = read_dict(corpus_path)
    return set(words)
def walk_dfa(dfa, w):
    """Walk a DFA over w, printing a boxed trace of every step.

    For each symbol the function follows one transition and prints the
    consumed prefix, the state reached and whether that state is final.
    Between two boxes it prints a short vertical connector labelled with the
    next symbol.

    Args:
        dfa: object exposing `initial_state`, `transitions` (indexed as
            transitions[state][symbol]) and `final_states` (supports `in`).
        w: the input string to consume, one symbol at a time.
    Return:
        None; the trace is written to standard output.
    """
    width = 40
    h = 2
    rule = "-" * width
    half_pad = " " * (width // 2)
    connector = half_pad + "|" + half_pad

    state = dfa.initial_state
    consumed = ""
    for pos, symbol in enumerate(w):
        print(rule)
        state = dfa.transitions[state][symbol]
        consumed = consumed + symbol
        print("| current string: {}".format(consumed))
        print("| current state: {}".format(state))
        print("| Accepted: {}".format(state in dfa.final_states))
        print(rule)
        if pos >= (len(w) - 1):
            # Last symbol: no connector to a following box.
            continue
        # h // 2 connector lines above and below the upcoming symbol.
        for _ in range(h // 2):
            print(connector)
        print(half_pad + w[pos + 1])
        for _ in range(h // 2):
            print(connector)