From 9b43560faa72ee90cf7ac150e1faaf129cdda677 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 18 Nov 2024 01:13:08 -0800 Subject: [PATCH] Fixes a TypeError in Sacrebleu. (#387) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/evaluation-task-request.md | 4 ++-- .github/ISSUE_TEMPLATE/feature-request.md | 1 - README.md | 2 +- examples/model_configs/peft_model.yaml | 4 ++-- examples/model_configs/quantized_model.yaml | 4 ++-- src/lighteval/metrics/metrics_corpus.py | 10 +++++++++- 6 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/evaluation-task-request.md b/.github/ISSUE_TEMPLATE/evaluation-task-request.md index 38310bc1..4b890858 100644 --- a/.github/ISSUE_TEMPLATE/evaluation-task-request.md +++ b/.github/ISSUE_TEMPLATE/evaluation-task-request.md @@ -13,6 +13,6 @@ assignees: '' ## Evaluation metadata Provide all available -- Paper url: -- Github url: +- Paper url: +- Github url: - Dataset url: diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md index f8a597fe..801b1047 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.md +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -15,4 +15,3 @@ A clear and concise description of what you want to happen. ## Posssible alternatives A clear and concise description of any alternative solutions or features you've considered. - diff --git a/README.md b/README.md index 4e03b14c..f554ed17 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Harness and HELM teams for their pioneering work on LLM evaluations. Got ideas? Found a bug? Want to add a [task](https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task) or [metric](https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric)? -Contributions are warmly welcomed! +Contributions are warmly welcomed! If you're adding a new feature, please open an issue first. diff --git a/examples/model_configs/peft_model.yaml b/examples/model_configs/peft_model.yaml index 200542ae..d94ff610 100644 --- a/examples/model_configs/peft_model.yaml +++ b/examples/model_configs/peft_model.yaml @@ -1,8 +1,8 @@ model: - type: "base" + type: "base" base_params: model_args: "pretrained=predibase/customer_support,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT and the base model below will contain the original model on which the adapters will be applied. - dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. + dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. 
compile: true merged_weights: # Ignore this section if you are not using PEFT models delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name diff --git a/examples/model_configs/quantized_model.yaml b/examples/model_configs/quantized_model.yaml index 5b69de95..dfac1c95 100644 --- a/examples/model_configs/quantized_model.yaml +++ b/examples/model_configs/quantized_model.yaml @@ -1,8 +1,8 @@ model: - type: "base" + type: "base" base_params: model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... - dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. + dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. compile: true merged_weights: # Ignore this section if you are not using PEFT models delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 59b9ecea..1286ab08 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -30,6 +30,7 @@ import sacrebleu import sklearn.metrics +from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics.sample_preparator import ( GenerativeCorpusMetricInput, LogprobCorpusMetricInput, @@ -103,7 +104,14 @@ def __init__(self, metric_type: str): def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation.""" golds = [i.golds for i in items] - preds = [as_list(i.preds) for i in items] + preds = [] + for i in items: + pred = as_list(i.preds) + if len(pred) > 1: + hlog_warn( + f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{self.metric.__name__})." + ) + preds.append(pred[0]) return float(self.metric(hypotheses=preds, references=golds).score)
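
Reviewer note, not part of the patch: a minimal, standalone sketch of the failure the subject line refers to and of what the patched compute() now does. The sentences and values below are illustrative assumptions; only sacrebleu's public corpus_bleu API is used, mirroring the hypotheses=/references= keywords in the existing call.

    # Illustrative sketch (assumed data): one corpus item with one gold reference
    # and two generated predictions.
    import sacrebleu

    golds = [["The cat sat on the mat."]]                            # gold references per item
    raw_preds = [["The cat is on the mat.", "A cat sat on a mat."]]  # several predictions per item

    # Before the fix, every hypothesis handed to sacrebleu was a *list* of strings
    # (the output of as_list), while sacrebleu only accepts plain strings per
    # hypothesis; this surfaces as the TypeError mentioned in the subject line:
    # sacrebleu.corpus_bleu(hypotheses=raw_preds, references=golds)  # raises TypeError

    # After the fix, only the first prediction of each item is kept (with a warning
    # logged when more than one is present), so each hypothesis is a plain string:
    preds = [item_preds[0] for item_preds in raw_preds]
    score = sacrebleu.corpus_bleu(hypotheses=preds, references=golds).score
    print(f"BLEU: {score:.2f}")

Tasks that genuinely produce several predictions per item will now see the hlog_warn message once per affected item rather than a crash; only the first prediction contributes to the corpus score.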