-
Notifications
You must be signed in to change notification settings - Fork 68
/
run_ngrams.py
235 lines (181 loc) · 7.34 KB
/
run_ngrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
from rasa.nlu.training_data import load_data
from rasa.nlu.model import Trainer
from rasa.nlu import config
from rasa.nlu.test import run_evaluation, cross_validate
import logging
import re
import numpy.random as rd
import numpy as np
import random
import string
import matplotlib.pylab as plt
# to get the logging stream into a string
try:
from cStringIO import StringIO
except ImportError:
from io import StringIO
"""
This script investigates whether adding ngrams improves the robustness of the
NLU model. It is run simply as
python run_ngrams.py
and uses the company dataset for the testings.
Durig the run, noise is added to the dataset.
With some probability 'noise', each of the company entity characters are
flipped to a random value.
This is averaged over several runs per value of 'noise'.
"""
def train_model(td_file, config_file, model_dir):
"""trains a model using the training data and config
creates model and returns the path to this model for evaluation"""
td = load_data(td_file)
trainer = Trainer(config.load(config_file))
trainer.train(td)
model_loc = trainer.persist(model_dir)
return model_loc
def train_test(td_file, config_file, model_dir, key="company", noise=0.1):
"""trains a model using the training data
(split into train-test) and config"""
td = load_data(td_file)
trainer = Trainer(config.load(config_file))
train, test = td.train_test_split(train_frac=0.8)
test = add_noise(test, key, noise=noise)
trainer.train(train)
tmp_fname = "data/tmp/temp_test.json"
model_loc = trainer.persist(model_dir)
with open(tmp_fname, "w", encoding="utf8") as f:
f.write(test.as_json())
evaluate_model(tmp_fname, model_loc)
def add_noise(td, key, noise=0.2):
"""with probability 'noise', randomizes each character of company entities"""
entity = key # the entity is just the key.
for ex in tuple(td.training_examples):
entities = ex.get("entities")
text = ex.text
if entities:
for e in entities:
if e["entity"] == entity:
value = e["value"]
new_value = list(value)
for i in range(len(value)):
if rd.random() < noise and new_value[i] != " ":
new_value[i] = random.choice(string.ascii_lowercase)
new_value = "".join(new_value)
e["value"] = new_value
old_text = ex.as_dict()["text"]
if old_text:
new_text = list(old_text)
new_text[e["start"] : e["end"] + 1] = new_value
new_value = "".join(new_text)
ex.text = "".join(new_text)
if entities:
ex.set("entities", entities)
return td
def CV_eval(td_file, config_file, Nfolds=10):
"""trains a model with crossvalidation using the training data and config"""
td = load_data(td_file)
configuration = config.load(config_file)
cross_validate(td, Nfolds, configuration)
def evaluate_model(td_file, model_loc):
"""evaluates the model on the training data."""
run_evaluation(td_file, model_loc)
def get_path_dicts(key):
"""gets the right training data and model directory given the demo"""
td_dict = {"company": "data/company/company_full.json"}
td_lookup_dict = {"company": "data/company/company_full_lookup.json"}
td_ngrams_dict = {"company": "data/company/company_full_ngrams.json"}
td_both_dict = {"company": "data/company/company_full_both.json"}
model_dir_dict = {"company": "data/models"}
td = td_dict[key]
td_lookup = td_lookup_dict[key]
td_ngrams = td_ngrams_dict[key]
td_both = td_both_dict[key]
model_dir = model_dir_dict[key]
return td, td_lookup, td_ngrams, td_both, model_dir
def run_demo(key, noise=0.1):
"""runs the demo specified by key"""
td, td_lookup, td_ngrams, td_both, model_dir = get_path_dicts(key)
config_file = "configs/config.yaml"
print("running normal...")
train_test(td, config_file, model_dir, noise=noise)
print("running lookups...")
train_test(td_lookup, config_file, model_dir, noise=noise)
print("running ngrams...")
train_test(td_ngrams, config_file, model_dir, noise=noise)
print("running both...")
train_test(td_both, config_file, model_dir, noise=noise)
metric_list = strip_metrics(key)
return metric_list
def parse_metrics(match, key):
"""Gets the metrics out of the parsed logger stream"""
elements = match.split(" ")[1:]
elements = filter(lambda x: len(x) > 2, elements)
elements = [float(e) for e in elements]
metrics = dict(zip(["key", "precision", "recall", "f1"], [key] + elements))
return metrics
def strip_metrics(key):
# steals the logger stream and returns the metrics associated with key
stream_string = log_stream.getvalue()
stream_literal = repr(stream_string)
p_re = re.compile(key + "[ ]+\d.\d\d[ ]+\d.\d\d[ ]+\d.\d\d")
matches = p_re.findall(stream_literal)
metric_list = [parse_metrics(m, key) for m in matches]
return metric_list
def print_metrics(metric_list):
"""Prints the metrics for each training data"""
if metric_list:
key = metric_list[-4]["key"]
print("before adding lookup table(s), demo '{}' had:".format(key))
display_metrics(metric_list[-4])
print("after adding lookup table(s), demo '{}' had:".format(key))
display_metrics(metric_list[-3])
print("after adding ngrams, demo '{}' had:".format(key))
display_metrics(metric_list[-2])
print(
"after adding both lookup table(s) and ngrams, demo '{}' had:".format(key)
)
display_metrics(metric_list[-1])
else:
raise ValueError("metrics were not parsed correctly.")
def display_metrics(metrics):
"""Prints the metrics"""
for key, val in metrics.items():
print("\t{}:\t{}".format(key, val))
if __name__ == "__main__":
# capture logging to string
log_stream = StringIO()
logging.basicConfig(stream=log_stream, level=logging.INFO)
# number to average over
N_avg = 7
# number of noise points
N_noise = 20
# probability of randomizing each character of key entity
noises = list(np.linspace(0, 0.5, N_noise))
# store the F1 scores
f1s = {
"normal": [[] for _ in range(N_avg)],
"lookup": [[] for _ in range(N_avg)],
"ngram": [[] for _ in range(N_avg)],
"both": [[] for _ in range(N_avg)],
}
# cases to examine
cases = ["normal", "lookup", "ngram", "both"]
count = 1
for avg_index in range(N_avg):
for noise in noises:
print("working on run {} of {}".format(count, N_avg * N_noise))
print(" noise = {}".format(noise))
count += 1
log_stream.flush()
# run the demo on company
metric_list = run_demo("company", noise=noise)
for i, e in enumerate(cases):
f1s[e][avg_index].append(metric_list[-4 + i]["f1"])
for case, f1_list in f1s.items():
# compute the average over all runs for each noise level
f1s_T = list(map(list, zip(*f1_list)))
avgs = [sum(f) / N_avg for f in f1s_T]
plt.plot(noises, avgs)
plt.legend(cases)
plt.xlabel("noise")
plt.ylabel("f1s")
plt.show()