"""This file is going to hold some useful functions/classes used in the
Replication Evaluation Notebook that describes how the results were obtained"""
from itertools import chain # flatten
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import logsumexp
from scipy.stats import pearsonr, spearmanr, kendalltau
from nlgeval import NLGEval # for running baseline eval metrics
import caption_featurizers
import color_featurizers
from experiment import FeatureHandler
from models import LiteralListener, CaptionEncoder
from models import LiteralSpeakerScorer, CaptionGenerator
from monroe_data import MonroeData, MonroeDataEntry, Color
########################## UTILITY FUNCTIONS ##########################
def normalize(caption):
    """
    Makes caption text lowercase and removes unneeded whitespace and double quotes.

    Args:
        caption: a candidate caption string.
    Returns:
        caption lowercased, without extra whitespace or double quotes
    """
    return caption.lower().strip().replace('"', "")
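
# Example (illustrative input):
#     normalize('  "Bright BLUE"  ')  ->  'bright blue'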

def flatten(ctxs):
    """
    Turns a 2-D list of lists into a 1-D list.
    Equivalent to numpy's flatten but works on ragged (non-rectangular) 2-D lists.

    Args:
        ctxs: a 2-D list of lists of contexts.
            [[ctx1.1, ctx1.2], [ctx2.1, ctx2.2, ctx2.3], ...]
    Returns:
        1-D list of the same contexts.
        [ctx1.1, ctx1.2, ctx2.1, ctx2.2, ctx2.3, ...]
    """
    return list(chain.from_iterable(ctxs))
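
# Example (illustrative input):
#     flatten([[1, 2], [3, 4, 5]])  ->  [1, 2, 3, 4, 5]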

def target_col_for_context_id(df, context_id):
    """
    Gets the target color given a context id and a dataframe to search in.

    Args:
        df: the dataframe that holds the target color.
        context_id: the id of the context whose target color is desired.
    Returns:
        Tuple of 3 floats representing the HSL coordinates of the target color.
    """
    ctx_entry = df[df["contextId"] == context_id].iloc[0]
    statuses = ['click', 'alt1', 'alt2']
    color = (None, None, None)
    for status in statuses:
        if ctx_entry[f"{status}Status"] == "target":
            color = (ctx_entry[f"{status}ColH"],
                     ctx_entry[f"{status}ColS"],
                     ctx_entry[f"{status}ColL"])
            break
    return color
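
# Usage sketch: the dataframe is expected to have the Monroe-corpus columns
# referenced above ("contextId", "<status>Status", "<status>ColH/ColS/ColL"
# for status in click/alt1/alt2). The values below are made up:
#     if df has a row {"contextId": 42, "clickStatus": "target",
#                      "clickColH": 120.0, "clickColS": 50.0, "clickColL": 50.0, ...}
#     then target_col_for_context_id(df, 42)  ->  (120.0, 50.0, 50.0)
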
########################## PLOTTING FUNCTIONS ###########################
def plot_score_dists_for_metric(scores, metric):
    """
    Plots the scores assigned by a given metric on a violin plot.

    Args:
        scores: the scores to plot the distributions of. List of three
            dictionaries that contain all the scores. One descriptive, one
            ambiguous, and one misleading.
        metric: name of metric that assigned the scores to label the graph.
    Returns:
        None. Just prints correlations and displays graph.
    """
    plot_data = [score[metric][1] for score in scores]
    # let's also get the correlation coefficients
    # domain is [1, 1, 1, ...], [2, 2, 2, ...], [3, 3, 3, 3, ...]
    correlation_support = []
    for i, d in enumerate(plot_data):
        correlation_support.extend([i+1] * len(d))
    plot_data_flat = flatten(plot_data)
    pearson_r = pearsonr(correlation_support, plot_data_flat)
    spearman_r = spearmanr(correlation_support, plot_data_flat)
    kendall_t = kendalltau(correlation_support, plot_data_flat)
    print("Pearson r: {:.3f}, p = {:.3f}".format(pearson_r[0], pearson_r[1]))
    print("Spearman r: {:.3f}, p = {:.3f}".format(spearman_r[0], spearman_r[1]))
    print("Kendall τ: {:.3f}, p = {:.3f}".format(kendall_t[0], kendall_t[1]))
    # now plot the violin plots
    vpdata = plt.violinplot(plot_data, bw_method=0.2,
                            showmeans=True, showextrema=True)
    vpdata["cbars"].set(linewidths=1)
    vpdata["cmeans"].set(linewidths=1)
    vpdata["cmins"].set(linewidths=1)
    vpdata["cmaxes"].set(linewidths=1)
    # add scatter plots on top of the violins
    plt.scatter(correlation_support, plot_data_flat, alpha=0.01, s=5)
    plt.xticks([1, 2, 3], ["Descriptive", "Ambiguous", "Misleading"])
    plt.title("{} scores of captions".format(metric))
    plt.show()

def plot_score_dists(*scores, metrics=['Bleu_1', 'ROUGE_L', 'METEOR', 'CIDEr']):
    """Plots distributions of the scores assigned by each of the given metrics."""
    for metric in metrics:
        plot_score_dists_for_metric(scores, metric)
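
# Usage sketch: plot_score_dists expects one score dictionary per caption
# condition (Descriptive, Ambiguous, Misleading), each mapping metric names to
# (mean score, list of per-caption scores), as produced by
# NgramMetrics.get_overlap_scores or ListenerMetrics.get_listener_scores below.
# The variable names here are hypothetical:
#     plot_score_dists(descriptive_scores, ambiguous_scores, misleading_scores,
#                      metrics=['Bleu_1', 'CIDEr'])
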
######################## METRIC CALCULATION WRAPPERS #########################
class NgramMetrics:
    """
    Wrapper for calculating n-gram overlap metrics.

    Attributes:
        nlgeval_metrics: the NLGEval object (from Sharma et al., 2018) that
            performs all of the evaluations. We do not load the glove or
            skipthought evaluations and only use the n-gram overlap metrics.
    """
    def __init__(self):
        """
        Loads metrics without extra (slow) models.
        Calculates BLEU-1, BLEU-2, BLEU-3, BLEU-4, ROUGE-L, METEOR, and CIDEr.
        """
        self.nlgeval_metrics = NLGEval(no_overlap=False,
                                       no_glove=True,
                                       no_skipthoughts=True)

    def get_overlap_scores(self, refs, hyps):
        """
        Calculates n-gram scores given references and candidates.

        Args:
            refs: list of lists of all reference caption strings for each
                candidate caption.
            hyps: list of hypothesis or candidate captions.
        Returns:
            Dictionary of metric names to a tuple containing the mean score
            and the list of all scores. This list is the same size as the
            hyp list.
            {
                "Bleu_1" : (0.23, [0.41, 0.84, 0.31, ...]),
                "ROUGE_L" : (0.45, [0.12, 0.68, 0.24, ...]),
                ...
            }
        """
        refs = {idx: lines for (idx, lines) in enumerate(refs)}
        hyps = {idx: [line] for (idx, line) in enumerate(hyps)}
        ret_scores = {}
        for scorer, method in self.nlgeval_metrics.scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = (sc, scs)
            else:
                ret_scores[method] = (score, scores)
        return ret_scores
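
# Usage sketch (the caption strings are made up; NgramMetrics requires the
# nlg-eval package and its data files to be installed):
#     ngram_metrics = NgramMetrics()
#     refs = [["the bright blue one", "the darker blue"], ["the reddish one"]]
#     hyps = ["the blue one", "the red one"]
#     scores = ngram_metrics.get_overlap_scores(refs, hyps)
#     scores["Bleu_1"]  ->  (mean Bleu-1 score, [per-caption Bleu-1 scores])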

class ListenerMetrics:
    """
    Wrapper for the listener-based evaluation metrics (literal and pragmatic
    listener).

    Attributes:
        train_data: MonroeData object that holds all of the model's training
            data. It is used to fix the vocabulary for the model, but probably
            should be swapped out for a lighter vocabulary file.
        caption_phi: the caption feature function that takes care of tokenizing
            and normalizing captions and converting their tokens to indices.
        color_phi_ll: the color feature function used by the literal listener.
            It is based on a Fourier feature representation derived from RGB
            color space.
        color_phi_pl: the color feature function used by the pragmatic listener.
            It is also based on a Fourier feature representation but is in the
            HSV color space.
        model_ll: the pretrained literal listener model that selects a color
            given a caption.
        model_pl: the pretrained literal speaker model that gives the
            probability of a caption given a color context. It's used in the
            pragmatic listener model.
    """
    def __init__(self):
        """
        Sets up training data, color/caption feature functions, and pretrained
        models for evaluation.
        """
        # Data used to generate vocab (this is suboptimal)
        self.train_data = MonroeData("../data/csv/train_corpus_monroe.csv",
                                     "../data/entries/train_entries_monroe.pkl")
        self.caption_phi = caption_featurizers.CaptionFeaturizer(
            tokenizer=caption_featurizers.EndingTokenizer)
        # Literal Listener uses RGB for color features
        self.color_phi_ll = color_featurizers.ColorFeaturizer(
            color_featurizers.color_phi_fourier, "rgb", normalized=True)
        # Pragmatic Listener uses HSV for color features
        self.color_phi_pl = color_featurizers.ColorFeaturizer(
            color_featurizers.color_phi_fourier, "hsv", normalized=True)
        # placeholder feature handler to load vocab (this is ugly) TODO
        fh = FeatureHandler(self.train_data, self.train_data,
                            self.caption_phi, self.color_phi_ll)
        # load the listener models
        self.model_ll = LiteralListener(CaptionEncoder)
        self.model_ll.init_model(
            embed_dim=100,
            hidden_dim=100,
            vocab_size=fh.caption_featurizer.caption_indexer.size,
            color_dim=54)
        self.model_ll.load_model(
            "../model/literal_listener_5epoch_endings_tkn.params")
        self.model_pl = LiteralSpeakerScorer(CaptionGenerator)
        self.model_pl.init_model(
            color_in_dim=54,
            color_dim=100,
            vocab_size=fh.caption_featurizer.caption_indexer.size,
            embed_dim=100,
            speaker_hidden_dim=100)
        self.model_pl.load_model("../model/literal_speaker_30epochGLOVE.params")

    def get_lit_listener_scores(self, eval_contexts):
        """
        Queries the Literal Listener model for target colors given a caption.

        Args:
            eval_contexts: list of caption-color contexts to evaluate with
                the literal listener model.
        Returns:
            Array of log-probabilities of the target color given the caption,
            one per context.
        """
        feature_handler = FeatureHandler(self.train_data,  # for vocab :(
                                         eval_contexts,
                                         self.caption_phi,
                                         self.color_phi_ll,
                                         randomized_colors=True)
        X_assess = feature_handler.test_features()
        y_assess = feature_handler.test_targets()
        preds = self.model_ll.predict(X_assess)
        scores = preds[np.arange(len(preds)), y_assess]
        return scores

    def speaker_target(self, data_entry):
        """
        Gets gold tokens for finding caption probabilities.

        Args:
            data_entry: a MonroeDataEntry, i.e. a color-caption context.
        Returns:
            Tokenized caption with end token but no start token.
        """
        _, caption_ids = self.caption_phi.to_string_features(data_entry.caption)
        target = caption_ids[1:]
        return target

    def speaker_predictions_to_scores(self, results, targets):
        """
        Converts the speaker's predictions over tokens to a score for each color.

        Args:
            results: the log-probs output by the speaker model.
            targets: the gold token sequences used for calculating the
                probabilities of the captions.
        Returns:
            The log-probability assigned to the true target color given each
            caption (the target color is at index 0).
        """
        # Gets the probability of the caption given each color is the target.
        all_scores = []
        target_lens = np.array([len(target) for target in targets])
        for i, predictions in enumerate(results):
            scores = [0, 0, 0]
            for j, prediction in enumerate(predictions):
                scores[j] = np.sum(
                    prediction[np.arange(target_lens[i]), targets[i]].numpy()
                )  # end tokens are already cut off by LSS model
            all_scores.append(scores)
        # Apply Bayes Rule in log space to get prob of color given caption
        all_scores = np.array(all_scores)
        all_scores_dist = (all_scores.T - logsumexp(all_scores, axis=1)).T
        return all_scores_dist[:, 0]  # the target is at index 0
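
    # A worked sketch of the Bayes step above (numbers are made up): each row
    # of all_scores holds log P(caption | color_j) for the three colors, e.g.
    # [-2.0, -4.0, -5.0]. Assuming a uniform prior over colors,
    #     log P(color_j | caption) = log P(caption | color_j)
    #                                - logsumexp_k log P(caption | color_k)
    # so that row normalizes to roughly [-0.17, -2.17, -3.17], and the value
    # at index 0 (the target color) is what gets returned.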

    def get_prag_listener_scores(self, eval_contexts):
        """
        Queries the Pragmatic Listener model to score each color given a caption.

        Args:
            eval_contexts: the color-caption contexts being evaluated.
        Returns:
            Log-probabilities of the target color given the caption, one per
            context, derived from the speaker's token-level scores.
        """
        feature_handler = FeatureHandler(self.train_data,
                                         eval_contexts,
                                         self.caption_phi,
                                         self.color_phi_pl,
                                         randomized_colors=False,
                                         target_fn=self.speaker_target)
        X_assess = feature_handler.test_features()
        y_assess = feature_handler.test_targets()
        preds = self.model_pl.predict(X_assess)
        scores = self.speaker_predictions_to_scores(preds, y_assess)
        return scores

    def get_listener_scores(self, eval_contexts):
        """
        Runs all listener eval metrics and returns scores in a dictionary.
        Wrapper function for handling the calling of the literal listener and
        pragmatic listener evaluation functions.

        Args:
            eval_contexts: the color-caption contexts being scored.
        Returns:
            Dictionary containing the names of the two listener evaluation
            methods as keys and a tuple of mean score and list of scores
            assigned by those methods as values.
        """
        ll_scores = np.exp(self.get_lit_listener_scores(eval_contexts))
        pl_scores = np.exp(self.get_prag_listener_scores(eval_contexts))
        results = {
            "Literal Listener": (np.mean(ll_scores), ll_scores),
            "Pragmatic Listener": (np.mean(pl_scores), pl_scores)
        }
        return results
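
# A minimal usage sketch of how the pieces above might be combined. The
# evaluation-split paths and the caption strings below are hypothetical
# placeholders; only the training paths hard-coded in ListenerMetrics.__init__
# are taken from above.
if __name__ == "__main__":
    # Hypothetical evaluation split in the same (csv, pkl) format as the
    # training data loaded in ListenerMetrics.__init__.
    eval_contexts = MonroeData("../data/csv/dev_corpus_monroe.csv",
                               "../data/entries/dev_entries_monroe.pkl")

    listener_metrics = ListenerMetrics()
    listener_scores = listener_metrics.get_listener_scores(eval_contexts)
    print("Literal Listener mean:", listener_scores["Literal Listener"][0])
    print("Pragmatic Listener mean:", listener_scores["Pragmatic Listener"][0])

    # The n-gram metrics work on raw strings rather than MonroeData contexts;
    # the references and hypotheses here are made-up captions.
    ngram_metrics = NgramMetrics()
    refs = [[normalize("the bright blue one"), normalize("the lighter blue")]]
    hyps = [normalize("the blue one")]
    ngram_scores = ngram_metrics.get_overlap_scores(refs, hyps)
    print("Bleu_1:", ngram_scores["Bleu_1"][0])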