From 9b43560faa72ee90cf7ac150e1faaf129cdda677 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 18 Nov 2024 01:13:08 -0800 Subject: [PATCH] Fixes a TypeError in Sacrebleu. (#387) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/evaluation-task-request.md | 4 ++-- .github/ISSUE_TEMPLATE/feature-request.md | 1 - README.md | 2 +- examples/model_configs/peft_model.yaml | 4 ++-- examples/model_configs/quantized_model.yaml | 4 ++-- src/lighteval/metrics/metrics_corpus.py | 10 +++++++++- 6 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/evaluation-task-request.md b/.github/ISSUE_TEMPLATE/evaluation-task-request.md index 38310bc1..4b890858 100644 --- a/.github/ISSUE_TEMPLATE/evaluation-task-request.md +++ b/.github/ISSUE_TEMPLATE/evaluation-task-request.md @@ -13,6 +13,6 @@ assignees: '' ## Evaluation metadata Provide all available -- Paper url: -- Github url: +- Paper url: +- Github url: - Dataset url: diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md index f8a597fe..801b1047 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.md +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -15,4 +15,3 @@ A clear and concise description of what you want to happen. ## Posssible alternatives A clear and concise description of any alternative solutions or features you've considered. - diff --git a/README.md b/README.md index 4e03b14c..f554ed17 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Harness and HELM teams for their pioneering work on LLM evaluations. Got ideas? Found a bug? Want to add a [task](https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task) or [metric](https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric)? -Contributions are warmly welcomed! +Contributions are warmly welcomed! If you're adding a new feature, please open an issue first. diff --git a/examples/model_configs/peft_model.yaml b/examples/model_configs/peft_model.yaml index 200542ae..d94ff610 100644 --- a/examples/model_configs/peft_model.yaml +++ b/examples/model_configs/peft_model.yaml @@ -1,8 +1,8 @@ model: - type: "base" + type: "base" base_params: model_args: "pretrained=predibase/customer_support,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT and the base model below will contain the original model on which the adapters will be applied. - dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. + dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. 
compile: true merged_weights: # Ignore this section if you are not using PEFT models delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name diff --git a/examples/model_configs/quantized_model.yaml b/examples/model_configs/quantized_model.yaml index 5b69de95..dfac1c95 100644 --- a/examples/model_configs/quantized_model.yaml +++ b/examples/model_configs/quantized_model.yaml @@ -1,8 +1,8 @@ model: - type: "base" + type: "base" base_params: model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... - dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. + dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. compile: true merged_weights: # Ignore this section if you are not using PEFT models delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 59b9ecea..1286ab08 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -30,6 +30,7 @@ import sacrebleu import sklearn.metrics +from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics.sample_preparator import ( GenerativeCorpusMetricInput, LogprobCorpusMetricInput, @@ -103,7 +104,14 @@ def __init__(self, metric_type: str): def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation.""" golds = [i.golds for i in items] - preds = [as_list(i.preds) for i in items] + preds = [] + for i in items: + pred = as_list(i.preds) + if len(pred) > 1: + hlog_warn( + f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{self.metric.__name__})." + ) + preds.append(pred[0]) return float(self.metric(hypotheses=preds, references=golds).score)
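
Reviewer note, not part of the patch: a minimal, standalone sketch of the failure the subject line refers to and of what the patched compute() now does. The sentences and values below are illustrative assumptions; only sacrebleu's public corpus_bleu API is used, mirroring the hypotheses=/references= keywords in the existing call.

    # Illustrative sketch (assumed data): one corpus item with one gold reference
    # and two generated predictions.
    import sacrebleu

    golds = [["The cat sat on the mat."]]                            # gold references per item
    raw_preds = [["The cat is on the mat.", "A cat sat on a mat."]]  # several predictions per item

    # Before the fix, every hypothesis handed to sacrebleu was a *list* of strings
    # (the output of as_list), while sacrebleu only accepts plain strings per
    # hypothesis; this surfaces as the TypeError mentioned in the subject line:
    # sacrebleu.corpus_bleu(hypotheses=raw_preds, references=golds)  # raises TypeError

    # After the fix, only the first prediction of each item is kept (with a warning
    # logged when more than one is present), so each hypothesis is a plain string:
    preds = [item_preds[0] for item_preds in raw_preds]
    score = sacrebleu.corpus_bleu(hypotheses=preds, references=golds).score
    print(f"BLEU: {score:.2f}")

Tasks that genuinely produce several predictions per item will now see the hlog_warn message once per affected item rather than a crash; only the first prediction contributes to the corpus score.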