diff --git a/README.md b/README.md
index 0d70ab2..3316cb3 100644
--- a/README.md
+++ b/README.md
@@ -128,8 +128,8 @@ gives
 METEOR: 0.295797
 ROUGE_L: 0.522104
 CIDEr: 1.242192
-SkipThoughtsCosineSimilairty: 0.626149
-EmbeddingAverageCosineSimilairty: 0.884690
+SkipThoughtsCosineSimilarity: 0.626149
+EmbeddingAverageCosineSimilarity: 0.884690
 VectorExtremaCosineSimilarity: 0.568696
 GreedyMatchingScore: 0.784205
diff --git a/nlgeval/__init__.py b/nlgeval/__init__.py
index a85db78..de7aeec 100644
--- a/nlgeval/__init__.py
+++ b/nlgeval/__init__.py
@@ -59,7 +59,7 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=Fa
         vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
         cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
         cosine_similarity = np.max(cosine_similarity, axis=0).mean()
-        print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity))
+        print("SkipThoughtsCosineSimilarity: %0.6f" % (cosine_similarity))
         ret_scores['SkipThoughtCS'] = cosine_similarity
         del model
@@ -142,7 +142,7 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False

 class NLGEval(object):
     glove_metrics = {
-        'EmbeddingAverageCosineSimilairty',
+        'EmbeddingAverageCosineSimilarity',
         'VectorExtremaCosineSimilarity',
         'GreedyMatchingScore',
     }
@@ -180,6 +180,11 @@ def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False,
             self.metrics_to_omit = set()
         else:
             self.metrics_to_omit = set(metrics_to_omit)
+            # For backwards compatibility.
+            if 'EmbeddingAverageCosineSimilairty' in self.metrics_to_omit:
+                self.metrics_to_omit.remove('EmbeddingAverageCosineSimilairty')
+                self.metrics_to_omit.add('EmbeddingAverageCosineSimilarity')
+
         assert len(self.metrics_to_omit - self.valid_metrics) == 0, \
             "Invalid metrics to omit: {}".format(self.metrics_to_omit - self.valid_metrics)
diff --git a/nlgeval/tests/test_nlgeval.py b/nlgeval/tests/test_nlgeval.py
index 08b5d60..cd6590d 100644
--- a/nlgeval/tests/test_nlgeval.py
+++ b/nlgeval/tests/test_nlgeval.py
@@ -25,10 +25,11 @@ def test_compute_metrics_oo(self):
         self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
         self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
         self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
-        self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilairty'], places=5)
+        self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilarity'], places=5)
+        self.assertEqual(scores['EmbeddingAverageCosineSimilarity'], scores['EmbeddingAverageCosineSimilairty'])
         self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
         self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
-        self.assertEqual(11, len(scores))
+        self.assertEqual(12, len(scores))

         scores = n.compute_metrics(ref_list=[
             [
@@ -53,10 +54,10 @@ def test_compute_metrics_oo(self):
         self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5)
         self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5)
         self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5)
-        self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5)
+        self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilarity'], places=5)
         self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5)
         self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5)
-        self.assertEqual(11, len(scores))
+        self.assertEqual(12, len(scores))

         # Non-ASCII tests.
         scores = n.compute_individual_metrics(ref=["Test en français.",
@@ -70,10 +71,11 @@ def test_compute_metrics_oo(self):
         self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
         self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
         self.assertAlmostEqual(0.9192341566085815, scores['SkipThoughtCS'], places=5)
-        self.assertAlmostEqual(0.906562, scores['EmbeddingAverageCosineSimilairty'], places=5)
+        self.assertAlmostEqual(0.906562, scores['EmbeddingAverageCosineSimilarity'], places=5)
+        self.assertEqual(scores['EmbeddingAverageCosineSimilarity'], scores['EmbeddingAverageCosineSimilairty'])
         self.assertAlmostEqual(0.815158, scores['VectorExtremaCosineSimilarity'], places=5)
         self.assertAlmostEqual(0.940959, scores['GreedyMatchingScore'], places=5)
-        self.assertEqual(11, len(scores))
+        self.assertEqual(12, len(scores))

         scores = n.compute_individual_metrics(ref=["テスト"],
                                               hyp="テスト")
@@ -83,10 +85,10 @@ def test_compute_metrics_oo(self):
         self.assertAlmostEqual(0.0, scores['CIDEr'], places=3)
         self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3)
         self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3)
-        self.assertEqual(11, len(scores))
+        self.assertEqual(12, len(scores))

     def test_compute_metrics_omit(self):
-        n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilairty'])
+        n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilarity'])

         # Individual Metrics
         scores = n.compute_individual_metrics(ref=["this is a test",
@@ -115,7 +117,8 @@ def test_compute_metrics(self):
         self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5)
         self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5)
         self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5)
-        self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5)
+        self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilarity'], places=5)
+        self.assertEqual(scores['EmbeddingAverageCosineSimilarity'], scores['EmbeddingAverageCosineSimilairty'])
         self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5)
         self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5)
-        self.assertEqual(11, len(scores))
+        self.assertEqual(12, len(scores))
diff --git a/nlgeval/word2vec/evaluate.py b/nlgeval/word2vec/evaluate.py
index 58822e8..5aa3942 100644
--- a/nlgeval/word2vec/evaluate.py
+++ b/nlgeval/word2vec/evaluate.py
@@ -51,6 +51,10 @@ def eval_emb_metrics(hypothesis, references, emb=None, metrics_to_omit=None):
     if metrics_to_omit is None:
         metrics_to_omit = set()
+    else:
+        if 'EmbeddingAverageCosineSimilairty' in metrics_to_omit:
+            metrics_to_omit.remove('EmbeddingAverageCosineSimilairty')
+            metrics_to_omit.add('EmbeddingAverageCosineSimilarity')

     emb_hyps = []
     avg_emb_hyps = []
@@ -94,9 +98,11 @@
         extreme_emb_refs.append(extreme_emb_refsource)

     rval = []
-    if 'EmbeddingAverageCosineSimilairty' not in metrics_to_omit:
+    if 'EmbeddingAverageCosineSimilarity' not in metrics_to_omit:
         cos_similarity = list(map(lambda refv: cosine_similarity(refv, avg_emb_hyps).diagonal(), avg_emb_refs))
         cos_similarity = np.max(cos_similarity, axis=0).mean()
+        rval.append("EmbeddingAverageCosineSimilarity: %0.6f" % (cos_similarity))
+        # For backwards compatibility with an old typo before Nov 20, 2019.
         rval.append("EmbeddingAverageCosineSimilairty: %0.6f" % (cos_similarity))

     if 'VectorExtremaCosineSimilarity' not in metrics_to_omit:
diff --git a/setup.py b/setup.py
index ac5ff59..d68ec1e 100755
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,7 @@
 reqs = [str(ir.req) for ir in install_reqs]

 setup(name='nlg-eval',
-      version='2.2',
+      version='2.3',
       description="Wrapper for multiple NLG evaluation methods and metrics.",
       author='Shikhar Sharma, Hannes Schulz, Justin Harris',
       author_email='shikhar.sharma@microsoft.com, hannes.schulz@microsoft.com, justin.harris@microsoft.com',