Official ROUGE added #74

Open · wants to merge 10 commits into master
13 changes: 7 additions & 6 deletions nlgeval/__init__.py
@@ -33,7 +33,7 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=Fa
scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Meteor(), "METEOR"),
(Rouge(), "ROUGE_L"),
(Rouge(), ["Rouge_1", "Rouge_2", "Rouge_3", "Rouge_4", "Rouge_L", "Rouge_W", "Rouge_S*", "Rouge_SU*"]),
(Cider(), "CIDEr")
]
for scorer, method in scorers:
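
The hunk above changes the Rouge entry's `method` from the single string "ROUGE_L" to a list of ROUGE variants, mirroring how Bleu already reports Bleu_1 through Bleu_4. As a hypothetical illustration only (FakeRouge and its dummy scores are stand-ins, not this PR's scorer), a (scorer, method) loop can handle both shapes like this:

class FakeRouge:
    """Stand-in scorer returning one dummy value per requested ROUGE variant."""
    def compute_score(self, refs, hyps):
        scores = [0.41, 0.17, 0.38]          # made-up corpus-level scores
        return scores, [[s] for s in scores]  # (corpus scores, per-sentence scores)

def collect_scores(scorers, refs, hyps):
    ret_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(refs, hyps)
        if isinstance(method, list):          # multi-metric scorer (e.g. Bleu, Rouge)
            for sc, name in zip(score, method):
                ret_scores[name] = sc
        else:                                 # single-metric scorer (e.g. METEOR, CIDEr)
            ret_scores[method] = score
    return ret_scores

print(collect_scores([(FakeRouge(), ["Rouge_1", "Rouge_2", "Rouge_L"])], {}, {}))
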
@@ -98,7 +98,7 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False
scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Meteor(), "METEOR"),
(Rouge(), "ROUGE_L"),
(Rouge(), ["Rouge_1", "Rouge_2", "Rouge_3", "Rouge_4", "Rouge_L", "Rouge_W", "Rouge_S*", "Rouge_SU*"]),
(Cider(), "CIDEr")
]
for scorer, method in scorers:
@@ -151,7 +151,7 @@ class NLGEval(object):
# Overlap
'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4',
'METEOR',
'ROUGE_L',
"Rouge_1", "Rouge_2", "Rouge_3", "Rouge_4", "Rouge_L", "Rouge_W", "Rouge_S*", "Rouge_SU*",
'CIDEr',

# Skip-thought
@@ -175,7 +175,6 @@ def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False,
Metrics to omit. Omitting Bleu_{i} will omit Bleu_{j} for j>=i.
:type metrics_to_omit: Optional[Collection[str]]
"""

if metrics_to_omit is None:
self.metrics_to_omit = set()
else:
@@ -210,8 +209,8 @@ def load_scorers(self):

if 'METEOR' not in self.metrics_to_omit:
self.scorers.append((Meteor(), "METEOR"))
if 'ROUGE_L' not in self.metrics_to_omit:
self.scorers.append((Rouge(), "ROUGE_L"))
if 'ROUGE' not in self.metrics_to_omit:
self.scorers.append((Rouge(), ["Rouge_1", "Rouge_2", "Rouge_3", "Rouge_4", "Rouge_L", "Rouge_W", "Rouge_S*", "Rouge_SU*"]))
if 'CIDEr' not in self.metrics_to_omit:
self.scorers.append((Cider(), "CIDEr"))

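In load_scorers above, the omission key for this scorer changes from 'ROUGE_L' to 'ROUGE', so callers who previously passed metrics_to_omit={'ROUGE_L'} would use the new key to skip the whole ROUGE family. A usage sketch, assuming this branch is installed and its external dependencies (e.g. Java for METEOR) are available; the example sentences are made up:

from nlgeval import NLGEval

# Skip the embedding-based metrics and omit the whole ROUGE family via the new key.
n = NLGEval(no_skipthoughts=True, no_glove=True, metrics_to_omit={'ROUGE'})
scores = n.compute_individual_metrics(ref=['the cat sat on the mat'],
                                      hyp='a cat is sitting on the mat')
print(sorted(scores))  # Bleu_1..Bleu_4, CIDEr and METEOR remain; no Rouge_* keys
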
@@ -273,6 +272,7 @@ def compute_individual_metrics(self, ref, hyp):
value = float(value.strip())
ret_scores[name] = value

self.load_scorers()  # The official ROUGE script needs to be reloaded, or it throws an error.
return ret_scores

def compute_metrics(self, ref_list, hyp_list):
@@ -310,4 +310,5 @@ def compute_metrics(self, ref_list, hyp_list):
value = float(value.strip())
ret_scores[name] = value

self.load_scorers()  # The official ROUGE script needs to be reloaded, or it throws an error.
return ret_scores
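
The two added self.load_scorers() calls implement a re-initialize-after-use pattern: per the inline comment, the wrapper around the official ROUGE script cannot be reused across calls, so the scorer objects are rebuilt after every evaluation. A generic, self-contained illustration of that pattern (the class names and behaviour below are made up, not nlgeval's API):

class OneShotScorer:
    """Toy scorer that, like some external-tool wrappers, only works once."""
    def __init__(self):
        self._used = False

    def compute_score(self, refs, hyps):
        if self._used:
            raise RuntimeError("scorer must be re-created between calls")
        self._used = True
        return 0.42, [0.42]

class Evaluator:
    def __init__(self):
        self.load_scorers()

    def load_scorers(self):
        self.scorers = [OneShotScorer()]

    def compute(self, refs, hyps):
        score, _ = self.scorers[0].compute_score(refs, hyps)
        self.load_scorers()  # rebuild the scorers so the next call does not fail
        return score

e = Evaluator()
print(e.compute({}, {}), e.compute({}, {}))  # both calls succeed
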
295 changes: 295 additions & 0 deletions nlgeval/pycocoevalcap/rouge/official_rouge/README.txt

Large diffs are not rendered by default.

232 changes: 232 additions & 0 deletions nlgeval/pycocoevalcap/rouge/official_rouge/RELEASE-NOTE.txt
@@ -0,0 +1,232 @@
# Revision Note: 05/26/2005, Chin-Yew LIN
# 1.5.5
# (1) Correct stemming on multi-token BE heads and modifiers.
# Previously, only single token heads and modifiers were assumed.
# (2) Correct the resampling routine which ignores the last evaluation
# item in the evaluation list. Therefore, the average scores reported
# by ROUGE are based only on the first N-1 evaluation items.
# Thanks to Barry Schiffman at Columbia University for reporting this bug.
# This bug only affects ROUGE-1.5.X. For pre-1.5 ROUGE, it only affects
# the computation of confidence interval (CI) estimation, i.e. CI is only
# estimated by the first N-1 evaluation items, but it *does not* affect
# average scores.
# (3) Change read_text and read_text_LCS functions to read exact words or
# bytes required by users. Previous versions carried out whitespace
# compression and other string clean-up actions before enforcing the length
# limit.
# 1.5.4.1
# (1) Minor description change about "-t 0" option.
# 1.5.4
# (1) Add an easy evaluation mode for single-reference evaluations with the -z
# option.
# 1.5.3
# (1) Add an option to compute ROUGE scores based on the SIMPLE BE format. Given
# a set of peer and model summary files in BE format with appropriate
# options, ROUGE will compute matching scores based on BE lexical
# matches.
# There are 6 options:
# 1. H : Head only match. This is similar to unigram match but
# only BE Head is used in matching. BEs generated by
# Minipar-based breaker do not include head-only BEs,
# therefore, the score will always be zero. Use HM or HMR
# options instead.
# 2. HM : Head and modifier match. This is similar to bigram or
# skip bigram but it's head-modifier bigram match based on
# parse result. Only BE triples with non-NIL modifier are
# included in the matching.
# 3. HMR : Head, modifier, and relation match. This is similar to
# trigram match but it's head-modifier-relation trigram
# match based on parse result. Only BE triples with non-NIL
# relation are included in the matching.
# 4. HM1 : This is a combination of H and HM. It is similar to unigram +
# bigram or skip bigram with unigram match but it's
# head-modifier bigram match based on parse result.
# In this case, the modifier field in a BE can be "NIL"
# 5. HMR1 : This is a combination of HM and HMR. It is similar to
# trigram match but it's head-modifier-relation trigram
# match based on parse result. In this case, the relation
# field of the BE can be "NIL".
# 6. HMR2 : This is a combination of H, HM and HMR. It is similar to
# trigram match but it's head-modifier-relation trigram
# match based on parse result. In this case, the modifier and
# relation fields of the BE can both be "NIL".
# 1.5.2
# (1) Add an option to compute the ROUGE score by token, using the whole corpus
# as the averaging unit instead of individual sentences. Previous versions of
# ROUGE use sentence (or unit) boundaries to break up counting units and take
# the average score over the counting units as the final score.
# Using the whole corpus as one single counting unit can potentially
# improve the reliability of the final score, since it treats each token as
# equally important, while the previous approach treats each sentence as
# equally important and ignores the length of each individual
# sentence (i.e. long sentences contribute the same weight to the final
# score as short sentences).
# +v1.2 provides a choice between these two counting modes so that users can
# choose the one that fits their scenario.
# 1.5.1
# (1) Add a precision-oriented measure and an F-measure to deal with different lengths
# in candidates and references. The relative importance of recall and precision can
# be controlled by the 'alpha' parameter:
# alpha -> 0: recall is more important
# alpha -> 1: precision is more important
# Following Chapter 7 in C.J. van Rijsbergen's "Information Retrieval".
# http://www.dcs.gla.ac.uk/Keith/Chapter.7/Ch.7.html
# F = 1/(alpha * (1/P) + (1 - alpha) * (1/R)) ;;; weighted harmonic mean
# 1.4.2
# (1) Enforce length limit at the time when summary text is read. Previously (before
# and including v1.4.1), length limit was enforced at tokenization time.
# 1.4.1
# (1) Fix potential over counting in ROUGE-L and ROUGE-W
# In previous versions (i.e. 1.4 and older), the LCS hit count is computed
# by summing union hits over all model sentences. Each model sentence
# is compared with all peer sentences and the union LCS is marked. The
# length of the union LCS is the hit count of that model sentence. The
# final hit count is then the sum over all model union LCS hits. This could
# over-count a peer sentence that has already been marked as contributing
# to some other model sentence, resulting in double counting.
# This shows up in evaluations where the ROUGE-L score is higher than ROUGE-1,
# which is not correct.
# ROUGEeval-1.4.1.pl fixes this by adding a clip function to prevent
# double counting.
# 1.4
# (1) Remove internal Jackknifing procedure:
# Now the ROUGE script will use all the references listed in the
# <MODEL></MODEL> section in each <EVAL></EVAL> section and no
# automatic Jackknifing is performed.
# If a Jackknifing procedure is required when comparing human and system
# performance, then users have to set up the procedure in the ROUGE
# evaluation configuration script as follows:
# For example, to evaluate system X with 4 references R1, R2, R3, and R4,
# we do the following computation:
#
# for system: and for comparable human:
# s1 = X vs. R1, R2, R3 h1 = R4 vs. R1, R2, R3
# s2 = X vs. R1, R3, R4 h2 = R2 vs. R1, R3, R4
# s3 = X vs. R1, R2, R4 h3 = R3 vs. R1, R2, R4
# s4 = X vs. R2, R3, R4 h4 = R1 vs. R2, R3, R4
#
# Average system score for X = (s1+s2+s3+s4)/4 and for human = (h1+h2+h3+h4)/4
# Implementation of this in a ROUGE evaluation configuration script is as follows:
# Instead of writing all references in an evaluation section as below:
# <EVAL ID="1">
# ...
# <PEERS>
# <P ID="X">systemX</P>
# </PEERS>
# <MODELS>
# <M ID="1">R1</M>
# <M ID="2">R2</M>
# <M ID="3">R3</M>
# <M ID="4">R4</M>
# </MODELS>
# </EVAL>
# we write the following:
# <EVAL ID="1-1">
# <PEERS>
# <P ID="X">systemX</P>
# </PEERS>
# <MODELS>
# <M ID="2">R2</M>
# <M ID="3">R3</M>
# <M ID="4">R4</M>
# </MODELS>
# </EVAL>
# <EVAL ID="1-2">
# <PEERS>
# <P ID="X">systemX</P>
# </PEERS>
# <MODELS>
# <M ID="1">R1</M>
# <M ID="3">R3</M>
# <M ID="4">R4</M>
# </MODELS>
# </EVAL>
# <EVAL ID="1-3">
# <PEERS>
# <P ID="X">systemX</P>
# </PEERS>
# <MODELS>
# <M ID="1">R1</M>
# <M ID="2">R2</M>
# <M ID="4">R4</M>
# </MODELS>
# </EVAL>
# <EVAL ID="1-4">
# <PEERS>
# <P ID="X">systemX</P>
# </PEERS>
# <MODELS>
# <M ID="1">R1</M>
# <M ID="2">R2</M>
# <M ID="3">R3</M>
# </MODELS>
# </EVAL>
#
# In this case, the system and human numbers are comparable.
# ROUGE as it is implemented for summarization evaluation is a recall-based metric.
# As we increase the number of references, we are increasing the number of
# count units (n-gram or skip-bigram or LCSes) in the target pool (i.e.
# the number that ends up in the denominator of any ROUGE formula is larger).
# Therefore, a candidate summary has more chances to hit, but it also has to
# hit more. In the end, this means lower absolute ROUGE scores when more
# references are used, and scores computed with different sets of references
# should not be compared to each other. There is no normalization mechanism in
# ROUGE to properly adjust for differences due to the number of references used.
#
# In the ROUGE implementations before v1.4 when there are N models provided for
# evaluating system X in the ROUGE evaluation script, ROUGE does the
# following:
# (1) s1 = X vs. R2, R3, R4, ..., RN
# (2) s2 = X vs. R1, R3, R4, ..., RN
# (3) s3 = X vs. R1, R2, R4, ..., RN
# (4) s4 = X vs. R1, R2, R3, R5, ..., RN
# (5) ...
# (6) sN = X vs. R1, R2, R3, ..., RN-1
# The final ROUGE score is computed by taking the average of (s1, s2, s3,
# s4, ..., sN). When we provide only three references for evaluation of a
# human summarizer, ROUGE does the same thing but uses 2 out of 3
# references, gets three numbers, and then takes the average as the final
# score. Now ROUGE (after v1.4) will use all references without this
# internal Jackknifing procedure. The speed of the evaluation should improve
# a lot, since only one set instead of four sets of computation will be
# conducted.
# 1.3
# (1) Add skip bigram
# (2) Add an option to specify the number of sampling points (default is 1000)
# 1.2.3
# (1) Correct the environment variable option: -e. Now users can specify the
# environment variable ROUGE_EVAL_HOME using the "-e" option; previously this
# option was not active. Thanks to Zhouyan Li of Concordia University, Canada,
# for pointing this out.
# 1.2.2
# (1) Correct confidence interval calculation for median, maximum, and minimum.
# Line 390.
# 1.2.1
# (1) Add a sentence-per-line input format. See files in Verify-SPL for examples.
# (2) Streamline command line arguments.
# (3) Use bootstrap resampling to estimate confidence intervals instead of using t-test
# or z-test which assume a normal distribution.
# (4) Add LCS (longest common subsequence) evaluation method.
# (5) Add WLCS (weighted longest common subsequence) evaluation method.
# (6) Add length cutoff in bytes.
# (7) Add an option to specify the longest ngram to compute. The default is 4.
# 1.2
# (1) Change zero condition check in subroutine &computeNGramScores when
# computing $gram1Score from
# if($totalGram2Count!=0) to
# if($totalGram1Count!=0)
# Thanks to Ken Litkowski for this bug report.
# The original script would set $gram1Score to zero if there are no
# bigram matches. This should rarely have a significant effect on the final
# score since (a) there are bigram matches most of the time; (b) the
# computation of $gram1Score uses a Jackknifing procedure. However, it
# definitely did not compute the correct $gram1Score when there were no
# bigram matches.
# Therefore, users of version 1.1 should definitely upgrade to a newer
# version of the script that does not contain this bug.
# Note: To use this script, two additional data files are needed:
# (1) smart_common_words.txt - contains stopword list from SMART IR engine
# (2) WordNet-1.6.exc.db - WordNet 1.6 exception inflexion database
# These two files have to be put in a directory pointed by the environment
# variable: "ROUGE_EVAL_HOME".
# If the environment variable ROUGE_EVAL_HOME does not exist, this script
# will assume it can find these two database files in the current directory.
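
Two of the notes above lend themselves to a short worked example: the 1.5.1 alpha-weighted F-measure and the 1.5.2 choice between sentence-level and corpus-level averaging. The sketch below reimplements only those two ideas in Python for illustration; it is not taken from the Perl script, and all counts are made up.

def weighted_f(p, r, alpha=0.5):
    """F = 1 / (alpha * (1/P) + (1 - alpha) * (1/R)).
    alpha -> 0 emphasizes recall, alpha -> 1 emphasizes precision."""
    if p == 0.0 or r == 0.0:
        return 0.0
    return 1.0 / (alpha / p + (1.0 - alpha) / r)

print(weighted_f(0.5, 0.25, alpha=0.5))  # 0.333..., the plain harmonic mean
print(weighted_f(0.5, 0.25, alpha=0.0))  # 0.25, i.e. recall only

# (matched n-grams, reference n-grams) for two sentences of different lengths.
counts = [(3, 10), (8, 40)]

# Sentence-level averaging: score each unit, then average the unit scores.
sentence_avg = sum(h / t for h, t in counts) / len(counts)          # (0.3 + 0.2) / 2 = 0.25

# Corpus-level averaging: pool all counts, then score once.
corpus_avg = sum(h for h, _ in counts) / sum(t for _, t in counts)  # 11 / 50 = 0.22

print(sentence_avg, corpus_avg)
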