-
Notifications
You must be signed in to change notification settings - Fork 1
/
trie.py
165 lines (134 loc) · 4.44 KB
/
trie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Trie Data Structure
# ---------------------------------------------------------------------------
# Reference:
# - https://stackoverflow.com/questions/11015320/how-to-create-a-trie-in-python\
# ?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
import sys
from pprint import pprint
from collections import defaultdict
from util import *
# lib: https://github.com/caleb531/automata
from automata.fa.dfa import *
from automata.fa.nfa import *
TERMINATE = "$"
def corpus2trie(corpus_path):
"""
return {word} as a Trie object
"""
words = read_dict(corpus_path)
return Trie(words)
class Trie:
root = {}
n_words = 0
n_nodes = 1
is_patricia = False
def __init__(self, words):
"""construct a trie for a dictionary of words
Args:
words: [word]
Return:
"""
root = dict() # root of trie
self.n_words = len(words)
for w in words:
curr = root
for i, ch in enumerate(w):
if ch not in curr:
self.n_nodes += 1
curr[ch] = {}
# accepting?
if i == len(w) - 1:
curr[ch][TERMINATE] = True
curr = curr[ch]
self.root = root
def stats(self):
"""
Args:
Return:
"""
s = "Number of words: {:10}\n".format(self.n_words)
s += "Number of nodes: {:10}\n".format(self.n_nodes)
return s
def __str__(self):
"""print a trie in a pretty way
Args:
trie: multi-level dictionary
Return:
"""
s = ""
stack = []
for ch, children in self.root.items():
stack.append((ch, children, 0))
while stack:
ch, children, depth = stack.pop()
accepted = children.get(TERMINATE, False)
if accepted:
s += ("{} -({})\n".format(" "*depth, ch))
else:
s += ("{} - {} \n".format(" "*depth, ch))
depth += 1
for c, cc in children.items():
if (c == TERMINATE): continue
stack.append((c, cc, depth))
return s
def to_DFA(self):
"""construct a DFA from a trie
Return: a DFA object
"""
initial_state = "^"
states = {initial_state}
final_states = set()
transitions = defaultdict(dict)
queue = [(initial_state, self.root)]
state = 0
while queue:
k, children = queue.pop(0)
if (children.get(TERMINATE, False)):
final_states.add(k)
for c, cc in children.items():
if (c == TERMINATE): continue
c_state = k + c
states.add(c_state)
transitions[k][c] = {c_state}
queue.append((c_state, cc))
nfa = NFA(states = states,\
transitions = transitions,\
initial_state = initial_state,\
final_states = final_states,\
input_symbols = set(ALPHABETS))
return DFA.from_nfa(nfa)
def accept(self, node):
"""return true if ndoe is accepted
Args:
node: {}
Return:
"""
return TERMINATE in node
def to_patria_trie(self, verbose = True):
"""compress into a patria trie
Return:
"""
ans = defaultdict(dict)
self.n_nodes = 0
# [(parent, key, node)]
stack = list((ans, k, v) for (k, v) in self.root.items())
while stack:
parent, key, node = stack.pop()
if (key == TERMINATE):
parent[key] = node
continue
acc_key = key # accumulative key
while len(node.keys()) == 1 and TERMINATE not in node: # silly and non-terminal node
k, node = node.popitem()
acc_key += k
# pop old key from parent
parent.pop(key, None)
parent[acc_key] = node
self.n_nodes += 1
for k, v in node.items():
if k == TERMINATE: continue
stack.append((node, k, v))
if verbose:
print("Trie Compressed: {} -> {}".format(sys.getsizeof(self.root), sys.getsizeof(ans)))
self.root = ans
self.is_patricia = True