From 39a68da85f55de999d7d868da0cc79b6381a24b7 Mon Sep 17 00:00:00 2001
From: Egor Lebedev
Date: Fri, 8 Nov 2024 13:39:18 +0300
Subject: [PATCH 01/12] Add Udmurt (udm) translation literals (#381)

---
 .../templates/utils/translation_literals.py   | 23 +++++++++++++++++++
 src/lighteval/utils/language.py               |  2 ++
 2 files changed, 25 insertions(+)

diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py
index 7537c38f..b00957ac 100644
--- a/src/lighteval/tasks/templates/utils/translation_literals.py
+++ b/src/lighteval/tasks/templates/utils/translation_literals.py
@@ -865,6 +865,29 @@ def __getattribute__(self, name: str) -> str:
         colon=":",
     ),
     Language.TURKMEN: TranslationLiterals(language=Language.TURKMEN),
+    Language.UDMURT: TranslationLiterals(
+        language=Language.UDMURT,
+        question_word="юан",
+        answer="валэктон",
+        confirmation_word="озьы-а",
+        yes="бен",
+        no="ӧвӧл",
+        also="озьы ик",
+        cause_word="малы ке шуоно",
+        effect_word="соин ик",
+        true="шонерлык",
+        false="пӧяськон",
+        neither="мукет",
+        or_word="яке",
+        full_stop=".",
+        comma=",",
+        question_mark="?",
+        exclamation_mark="!",
+        word_space=" ",
+        sentence_space=" ",
+        colon=":",
+        indices=["А", "Б", "В", "Г", "Д", "Е"],
+    ),
     Language.UKRAINIAN: TranslationLiterals(
         language=Language.UKRAINIAN,
         question_word="питання",
diff --git a/src/lighteval/utils/language.py b/src/lighteval/utils/language.py
index 1e9707a3..e6e53984 100644
--- a/src/lighteval/utils/language.py
+++ b/src/lighteval/utils/language.py
@@ -121,6 +121,7 @@ class Language(Enum):
     CEBUANO = "ceb"
     WAR = "war"
     SHAN = "shn"
+    UDMURT = "udm"


 # This mapping was created for beleble, it converts iso_639_3 individual codes to iso_639_3 macro codes
@@ -232,6 +233,7 @@ class Language(Enum):
     "ars": Language.ARABIC,
     "bul": Language.BULGARIAN,
     "est": Language.ESTONIAN,
+    "udm": Language.UDMURT,
     # 'hau': Language.HAUSA,
     "ind": Language.INDONESIAN,
     # 'kea': Language.KABUVERDIANU,

From 78d978547b98275348675661c7a59e46bb6f67c3 Mon Sep 17 00:00:00 2001
From: Kryvich <44714498+Kryuski@users.noreply.github.com>
Date: Fri, 8 Nov 2024 15:09:39 +0300
Subject: [PATCH 02/12] This PR adds translation literals for Belarusian
 language. (#382)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 .../templates/utils/translation_literals.py   | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py
index b00957ac..cb77dc16 100644
--- a/src/lighteval/tasks/templates/utils/translation_literals.py
+++ b/src/lighteval/tasks/templates/utils/translation_literals.py
@@ -149,7 +149,29 @@ def __getattribute__(self, name: str) -> str:
         colon=":",
         semicolon=";",
     ),
-    Language.BELARUSIAN: TranslationLiterals(language=Language.BELARUSIAN),
+    Language.BELARUSIAN: TranslationLiterals(
+        language=Language.BELARUSIAN,
+        question_word="пытанне",
+        answer="адказ",
+        confirmation_word="ці не так",
+        yes="так",
+        no="не",
+        also="апроч таго",
+        cause_word="бо",
+        effect_word="таму",
+        true="праўда",
+        false="няпраўда",
+        neither="ні тое, ні тое",
+        or_word="ці",
+        full_stop=".",
+        comma=",",
+        question_mark="?",
+        exclamation_mark="!",
+        word_space=" ",
+        sentence_space=" ",
+        colon=":",
+        indices=["А", "Б", "В", "Г", "Д", "Е"],
+    ),
     Language.BENGALI: TranslationLiterals(language=Language.BENGALI, question_word="প্রশ্ন"),
     Language.BIHARI: TranslationLiterals(language=Language.BIHARI),  # Deprecated
     Language.BOSNIAN: TranslationLiterals(language=Language.BOSNIAN),

From e51be17ec29080c914c1f260e14694086720c923 Mon Sep 17 00:00:00 2001
From: Nazim Ali
Date: Fri, 8 Nov 2024 09:12:25 -0500
Subject: [PATCH 03/12] fix: cache directory variable (#378)

Fixes cache directory bug by using HF_HUB_CACHE instead of HF_HOME

See documentation
https://huggingface.co/docs/huggingface_hub/main/en/package_reference/environment_variables#hfhubcache
---
 src/lighteval/utils/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/utils/utils.py b/src/lighteval/utils/utils.py
index 1cc51955..2c0ea487 100644
--- a/src/lighteval/utils/utils.py
+++ b/src/lighteval/utils/utils.py
@@ -192,7 +192,7 @@ class EnvConfig:
         token (str): authentication token used for accessing the HuggingFace Hub.
""" - cache_dir: str = os.getenv("HF_HOME", "/scratch") + cache_dir: str = os.getenv("HF_HUB_CACHE", "/scratch") token: str = os.getenv("HF_TOKEN") From 2f03df010b95725867eb9cba25731e8f0e53dc07 Mon Sep 17 00:00:00 2001 From: vsabolcec <60775189+vsabolcec@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:24:49 +0100 Subject: [PATCH 04/12] greedy_until() fix (#344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/models/nanotron_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 40c8f237..0814acac 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -1133,7 +1133,7 @@ def greedy_until( else: # Longest context in the current split is the first item (since we sort reversed) context_enc = dataset[0][1].tokenized_context - max_gen = max(item[1].generation_size for item in dataset) + max_gen = max(item.generation_size for item in dataset) max_input_length = min(len(context_enc) + max_gen, self.max_length) batch_size = self._get_batch_size( From 853e10be40e93976362ef41e320665c0e20facf5 Mon Sep 17 00:00:00 2001 From: Dmitry Gaynullin <117692111+gaydmi@users.noreply.github.com> Date: Wed, 13 Nov 2024 09:07:50 +0100 Subject: [PATCH 05/12] added tatar literals (#383) --------- Co-authored-by: Dmitry Gaynullin --- .../templates/utils/translation_literals.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py index cb77dc16..ef5194f3 100644 --- a/src/lighteval/tasks/templates/utils/translation_literals.py +++ b/src/lighteval/tasks/templates/utils/translation_literals.py @@ -817,7 +817,29 @@ def __getattribute__(self, name: str) -> str: cause_word="காரணமாக", effect_word="எனவே", ), - Language.TATAR: TranslationLiterals(language=Language.TATAR), + Language.TATAR: TranslationLiterals( + language=Language.TATAR, + question_word="сорау", + answer="җавап", + confirmation_word="шулай түгелме", + yes="әйе", + no="юк", + also="шулай ук", + cause_word="чөнки", + effect_word="шуңа күрә", + or_word="яки", + true="дөрес", + false="ялган", + neither="бер генә дә", + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + indices=["А", "Б", "В", "Г", "Д", "Е"], + ), Language.TELUGU: TranslationLiterals( language=Language.TELUGU, question_word="ప్రశ్న", From 9be2a38797b0aef46f265e477b2aa966166f4b87 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 13 Nov 2024 09:49:44 -0800 Subject: [PATCH 06/12] Fixes a TypeError for generative metrics. 
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 src/lighteval/metrics/sample_preparator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lighteval/metrics/sample_preparator.py b/src/lighteval/metrics/sample_preparator.py
index ad709033..dc32d95c 100644
--- a/src/lighteval/metrics/sample_preparator.py
+++ b/src/lighteval/metrics/sample_preparator.py
@@ -55,6 +55,7 @@ class PerplexityCorpusMetricInput(CorpusMetricInput):


 class GenerativePreparator:
+    @staticmethod
     def prepare(golds: list[str], predictions: list[str], **kwargs):
         """Prepares an individual generative example to the format expected by metrics computed at the corpus level (aggregated).

From 1faa3b2c63f0d2885f86479b644f2aa1e62ae282 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com>
Date: Fri, 15 Nov 2024 14:27:28 +0100
Subject: [PATCH 07/12] Update README.md

---
 README.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 22afd60d..4e03b14c 100644
--- a/README.md
+++ b/README.md
@@ -104,9 +104,17 @@ Harness and HELM teams for their pioneering work on LLM evaluations.
 Got ideas? Found a bug? Want to add a
 [task](https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task) or
 [metric](https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric)?
-Contributions are warmly
-welcomed!
+Contributions are warmly welcomed!
+
+If you're adding a new feature, please open an issue first.
+
+If you open a PR, don't forget to run the styling!
+
+```bash
+pip install -e .[dev]
+pre-commit install
+pre-commit run --all-files
+```

 ## 📜 Citation

 ```bibtex

From 9b43560faa72ee90cf7ac150e1faaf129cdda677 Mon Sep 17 00:00:00 2001
From: Joel Niklaus
Date: Mon, 18 Nov 2024 01:13:08 -0800
Subject: [PATCH 08/12] Fixes a TypeError in Sacrebleu. (#387)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---------

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 .github/ISSUE_TEMPLATE/evaluation-task-request.md |  4 ++--
 .github/ISSUE_TEMPLATE/feature-request.md         |  1 -
 README.md                                         |  2 +-
 examples/model_configs/peft_model.yaml            |  4 ++--
 examples/model_configs/quantized_model.yaml       |  4 ++--
 src/lighteval/metrics/metrics_corpus.py           | 10 +++++++++-
 6 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/evaluation-task-request.md b/.github/ISSUE_TEMPLATE/evaluation-task-request.md
index 38310bc1..4b890858 100644
--- a/.github/ISSUE_TEMPLATE/evaluation-task-request.md
+++ b/.github/ISSUE_TEMPLATE/evaluation-task-request.md
@@ -13,6 +13,6 @@ assignees: ''

 ## Evaluation metadata
 Provide all available
-- Paper url: 
-- Github url: 
+- Paper url:
+- Github url:
 - Dataset url:
diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md
index f8a597fe..801b1047 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.md
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -15,4 +15,3 @@ A clear and concise description of what you want to happen.

 ## Posssible alternatives
 A clear and concise description of any alternative solutions or features you've considered.
-
diff --git a/README.md b/README.md
index 4e03b14c..f554ed17 100644
--- a/README.md
+++ b/README.md
@@ -104,7 +104,7 @@ Harness and HELM teams for their pioneering work on LLM evaluations.
 Got ideas? Found a bug? Want to add a
 [task](https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task) or
 [metric](https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric)?
-Contributions are warmly welcomed! 
+Contributions are warmly welcomed!

 If you're adding a new feature, please open an issue first.
diff --git a/examples/model_configs/peft_model.yaml b/examples/model_configs/peft_model.yaml
index 200542ae..d94ff610 100644
--- a/examples/model_configs/peft_model.yaml
+++ b/examples/model_configs/peft_model.yaml
@@ -1,8 +1,8 @@
 model:
-  type: "base" 
+  type: "base"
   base_params:
     model_args: "pretrained=predibase/customer_support,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT and the base model below will contain the original model on which the adapters will be applied.
-    dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. 
+    dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
     compile: true
   merged_weights: # Ignore this section if you are not using PEFT models
     delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name
diff --git a/examples/model_configs/quantized_model.yaml b/examples/model_configs/quantized_model.yaml
index 5b69de95..dfac1c95 100644
--- a/examples/model_configs/quantized_model.yaml
+++ b/examples/model_configs/quantized_model.yaml
@@ -1,8 +1,8 @@
 model:
-  type: "base" 
+  type: "base"
   base_params:
     model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
-    dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. 
+    dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
     compile: true
   merged_weights: # Ignore this section if you are not using PEFT models
     delta_weights: false # set to True of your model should be merged with a base model, also need to provide the base model name
diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index 59b9ecea..1286ab08 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -30,6 +30,7 @@
 import sacrebleu
 import sklearn.metrics

+from lighteval.logging.hierarchical_logger import hlog_warn
 from lighteval.metrics.sample_preparator import (
     GenerativeCorpusMetricInput,
     LogprobCorpusMetricInput,
@@ -103,7 +104,14 @@ def __init__(self, metric_type: str):
     def compute(self, items: list[GenerativeCorpusMetricInput]) -> float:
         """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation."""
         golds = [i.golds for i in items]
-        preds = [as_list(i.preds) for i in items]
+        preds = []
+        for i in items:
+            pred = as_list(i.preds)
+            if len(pred) > 1:
+                hlog_warn(
+                    f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{self.metric.__name__})."
+ ) + preds.append(pred[0]) return float(self.metric(hypotheses=preds, references=golds).score) From fd8a68201b41070e78439eeee91ae2de74a4142c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?= Date: Mon, 18 Nov 2024 09:42:30 -0400 Subject: [PATCH 09/12] Adds template for translation tasks (#391) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * implement tranlsation prompt * add small coment about tranlsation prompt * change formatting to reformat language dependant parts --------- Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/tasks/templates/continuation.py | 10 +- src/lighteval/tasks/templates/translation.py | 156 ++++++++++++++++++ tests/tasks/templates/test_translation.py | 120 ++++++++++++++ 3 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 src/lighteval/tasks/templates/translation.py create mode 100644 tests/tasks/templates/test_translation.py diff --git a/src/lighteval/tasks/templates/continuation.py b/src/lighteval/tasks/templates/continuation.py index 84c11230..6435fc8f 100644 --- a/src/lighteval/tasks/templates/continuation.py +++ b/src/lighteval/tasks/templates/continuation.py @@ -86,6 +86,7 @@ def get_continuation_prompt_function( language: Language, adapter: Callable[[dict], ContinuationInput | None] | ContinuationDictAdapter, formulation: Formulation = MCFFormulation(), + fix_formatting: bool = True, ): """ Create a templated prompt function for a Continuation task. @@ -118,6 +119,7 @@ def get_continuation_prompt_function( adapter (Callable[[dict], ContinuationInput] | ContinuationDictAdapter): Either a function that takes a dataset row and returns a ContinuationInput, or a dictionary with keys corresponding to the field names in the dataset row. Note: Both ContinuationDictAdapter and ContinuationInput are TypeDicts, this means that the caller provides dictionary and doesn't initialize any class! formulation (Formulation, optional): The formulation (MCF/Hybrid/CF) to use for the task. Defaults to MCFFormulation(). + fix_formatting (bool, optional): Whether to fix the formatting of the text by capitalizing and fixing punctuation based on language. If False, the text will be used as-is. Defaults to True. Returns: Callable: A function that generates Continuation prompt based on the given parameters. 
""" @@ -132,10 +134,16 @@ def prepare_prompt(line: dict): instruction_val = cont_input.get("instruction") instruction = f"{instruction_val}\n" if instruction_val else "" - context = f"{capitalize(fix_ending_punct(cont_input['context'], translation_literals))}" + context = ( + f"{capitalize(fix_ending_punct(cont_input['context'], translation_literals))}" + if fix_formatting + else cont_input["context"] + ) continuations = [ fix_capitalization(context, fix_ending_punct(continuation, translation_literals), translation_literals) + if fix_formatting + else continuation for continuation in cont_input["continuations"] ] diff --git a/src/lighteval/tasks/templates/translation.py b/src/lighteval/tasks/templates/translation.py new file mode 100644 index 00000000..c90b99e0 --- /dev/null +++ b/src/lighteval/tasks/templates/translation.py @@ -0,0 +1,156 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from typing import Callable + +from langcodes import standardize_tag +from typing_extensions import NotRequired, TypedDict + +from lighteval.tasks.templates.continuation import get_continuation_prompt_function +from lighteval.tasks.templates.multichoice import create_adapter_from_dict +from lighteval.tasks.templates.utils.formatting_utils import capitalize, fix_ending_punct +from lighteval.tasks.templates.utils.formulation import Formulation, MCFFormulation +from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS +from lighteval.utils.language import Language +from lighteval.utils.utils import as_list + + +# Template chosen so that it's not very language-dependent, as it's not clear whether one should use the target or source language. +# It's also the best template based on https://arxiv.org/pdf/2301.07069. + + +TRANSLATION_CONTEXT = "{source_label}{colon}{sentence_space}{source_text}{sentence_space}{target_label}{colon}" + + +# Defined for type hinting only +class TranslationInput(TypedDict): + """ + Input for the Translation task. + Args: + source_text: The source text to be translated + target_text: The target text to be translated + instruction (optional): The instruction of the Translation task (e.g. Translate the following text to Turkish) + """ + + source_text: str + target_text: str | list[str] + gold_idx: NotRequired[int | list[int]] + instruction: NotRequired[str] + + +class TranslationAdapter(TypedDict): + """ + Adapter for mapping from the dataset row into the TranslationInput format. 
+    Args:
+        source_text: Column name in the row that contains the source text to be translated
+        target_text: Column name in the row that contains the target text to be translated
+        instruction (optional): Column name in the row that contains the instruction of the task (e.g. Translate the following text to Turkish)
+    """

+    source_text: str
+    target_text: str
+    gold_idx: NotRequired[int | list[int]]
+    instruction: NotRequired[str]
+
+
+def get_translation_prompt_function(
+    source_language: Language,
+    target_language: Language,
+    adapter: Callable[[dict], TranslationInput | None] | TranslationAdapter,
+    formulation: Formulation = MCFFormulation(),
+):
+    """
+    Create a templated prompt function for a Translation task.
+    Example tasks:
+    - WMT2016
+    - WMT2017
+
+    Format:
+    *CF*
+    EN: How are you? TR: | Nasılsın?
+
+    *Hybrid*
+    EN: How are you? TR:
+     A. Nasılsın?
+     B. Jak se máš?
+    Answer: | Nasılsın?/Jak se máš?
+
+    *MCF*
+    EN: How are you? TR:
+     A. Nasılsın?
+     B. Jak se máš?
+    Answer: | A/B
+
+    Args:
+        adapter (Callable[[dict], TranslationInput] | TranslationAdapter): Either a function that takes a dataset row and returns a TranslationInput, or a dictionary with keys corresponding to the field names in the dataset row.
+            Note: Both TranslationAdapter and TranslationInput are TypeDicts, this means that the caller provides dictionary and doesn't initialize any class!
+        formulation (Formulation, optional): The formulation to use for the task. Defaults to MCFFormulation().
+    Returns:
+        Callable: A function that generates Translation prompts based on the given parameters.
+    """
+    adapter_fn = create_adapter_from_dict(adapter)
+    continuation_prompt_fn = get_continuation_prompt_function(
+        Language.ENGLISH,
+        {"context": "context", "continuations": "continuations", "gold_idx": "gold_idx"},
+        formulation,
+        fix_formatting=False,
+    )
+    source_translation_literals = TRANSLATION_LITERALS[source_language]
+    target_translation_literals = TRANSLATION_LITERALS[target_language]
+
+    source_label_string = standardize_tag(source_language.value).upper()
+    target_label_string = standardize_tag(target_language.value).upper()
+
+    def translation_prompt(
+        line: dict,
+        task_name: str,
+    ):
+        input_data = adapter_fn(line)
+        if input_data is None:
+            return None
+
+        source_text = capitalize(fix_ending_punct(input_data["source_text"], source_translation_literals))
+
+        context = TRANSLATION_CONTEXT.format(
+            source_label=source_label_string,
+            source_text=source_text,
+            target_label=target_label_string,
+            colon=":",
+            sentence_space=" ",
+        )
+
+        continuations = [
+            capitalize(fix_ending_punct(text, target_translation_literals))
+            for text in as_list(input_data["target_text"])
+        ]
+
+        return continuation_prompt_fn(
+            {
+                "instruction": input_data.get("instruction", ""),
+                "context": context,
+                "continuations": continuations,
+                "gold_idx": input_data.get("gold_idx", list(range(len(continuations)))),
+            },
+            task_name,
+        )
+
+    return translation_prompt
diff --git a/tests/tasks/templates/test_translation.py b/tests/tasks/templates/test_translation.py
new file mode 100644
index 00000000..eab59cf1
--- /dev/null
+++ b/tests/tasks/templates/test_translation.py
@@ -0,0 +1,120 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+from lighteval.tasks.templates.translation import get_translation_prompt_function
+from lighteval.tasks.templates.utils.formulation import CFFormulation, MCFFormulation
+from lighteval.utils.language import Language
+
+
+def test_translation_prompt_cf():
+    """
+    Tests that translation prompt function works correctly for CF formulation.
+    """
+    test_input = {
+        "source_text": "Ahoj, jak se máš?",
+        "target_text": "Bonjour, comment allez-vous?",
+    }
+
+    prompt_fn = get_translation_prompt_function(
+        source_language=Language.CZECH,
+        target_language=Language.FRENCH,
+        adapter=lambda x: {
+            "source_text": x["source_text"],
+            "target_text": x["target_text"],
+        },
+        formulation=CFFormulation(),
+    )
+
+    doc = prompt_fn(test_input, "test_task")
+    assert doc is not None
+
+    assert doc.query == "CS: Ahoj, jak se máš? FR:"
+    assert doc.unconditioned_query == ""
+    assert doc.choices == [" Bonjour, comment allez-vous?"]
+    assert doc.gold_index == [0]
+
+
+def test_translation_prompt_mcf():
+    """
+    Tests that translation prompt function works correctly for MCF formulation.
+    """
+    test_input = {
+        "source_text": "Ahoj, jak se máš?",
+        "target_text": ["Bonjour, comment allez-vous?", "Ciao, come stai?"],
+    }
+
+    prompt_fn = get_translation_prompt_function(
+        source_language=Language.CZECH,
+        target_language=Language.FRENCH,
+        adapter=lambda x: {
+            "source_text": x["source_text"],
+            "target_text": x["target_text"],
+            "gold_idx": 0,
+        },
+        formulation=MCFFormulation(),
+    )
+
+    doc = prompt_fn(test_input, "test_task")
+    assert doc is not None
+
+    assert (
+        doc.query
+        == """\
+CS: Ahoj, jak se máš? FR:
+ A. Bonjour, comment allez-vous?
+ B. Ciao, come stai?
+Answer:\
+"""
+    )
+    assert doc.unconditioned_query == "Answer:"
+    assert doc.choices == [" A", " B"]
+    assert doc.gold_index == [0]
+
+
+def test_translation_prompt_cf_formatting():
+    """
+    Tests that translation prompt function works correctly for CF formulation with formatting.
+    """
+    test_input = {
+        "source_text": "How are you?",
+        "target_text": ["你好吗？"],
+    }
+
+    prompt_fn = get_translation_prompt_function(
+        source_language=Language.ENGLISH,
+        target_language=Language.CHINESE,
+        adapter=lambda x: {
+            "source_text": x["source_text"],
+            "target_text": x["target_text"],
+            "gold_idx": 0,
+        },
+        formulation=CFFormulation(),
+    )
+
+    doc = prompt_fn(test_input, "test_task")
+    assert doc is not None
+
+    assert doc.query == "EN: How are you? ZH:"
+    assert doc.unconditioned_query == ""
+    assert doc.choices == [" 你好吗？"]
+    assert doc.gold_index == [0]

From db609f791fc790e940d247534c7632bcc58771f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com>
Date: Tue, 19 Nov 2024 08:59:11 +0100
Subject: [PATCH 10/12] Use the programmatic interface using an already in
 memory loaded model (#390)

---------

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
---
 src/lighteval/logging/hierarchical_logger.py |   3 +
 src/lighteval/models/adapter_model.py        |   9 +-
 src/lighteval/models/base_model.py           | 109 +++++++++++++++++--
 src/lighteval/pipeline.py                    |  12 +-
 4 files changed, 121 insertions(+), 12 deletions(-)

diff --git a/src/lighteval/logging/hierarchical_logger.py b/src/lighteval/logging/hierarchical_logger.py
index 99287f75..1c4c3a11 100644
--- a/src/lighteval/logging/hierarchical_logger.py
+++ b/src/lighteval/logging/hierarchical_logger.py
@@ -34,8 +34,11 @@
     logger = get_logger(__name__, log_level="INFO")
 elif is_accelerate_available():
+    from accelerate import Accelerator, InitProcessGroupKwargs
     from accelerate.logging import get_logger

+    # We must init the accelerator before using the logger
+    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
     logger = get_logger(__name__, log_level="INFO")
 else:
     logger = Logger(__name__, level="INFO")
diff --git a/src/lighteval/models/adapter_model.py b/src/lighteval/models/adapter_model.py
index 18fd6890..dbf762d7 100644
--- a/src/lighteval/models/adapter_model.py
+++ b/src/lighteval/models/adapter_model.py
@@ -41,7 +41,14 @@ class AdapterModel(BaseModel):
     def _create_auto_tokenizer(self, config: AdapterModelConfig, env_config: EnvConfig) -> PreTrainedTokenizer:
         # By default, we look at the model config for the model stored in `base_model`
         # (= the parent model, not the model of interest)
-        return self._create_auto_tokenizer_with_name(config.base_model, config=config, env_config=env_config)
+        return self._create_auto_tokenizer_with_name(
+            model_name=config.base_model,
+            revision=config.revision,
+            env_config=env_config,
+            tokenizer_name=config.tokenizer,
+            subfolder=config.subfolder,
+            trust_remote_code=config.trust_remote_code,
+        )

     def _create_auto_model(self, config: AdapterModelConfig, env_config: EnvConfig) -> AutoModelForCausalLM:
         """Returns a PeftModel from a base model and a version fined tuned using PEFT."""
diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py
index debec448..993978d5 100644
--- a/src/lighteval/models/base_model.py
+++ b/src/lighteval/models/base_model.py
@@ -30,6 +30,7 @@
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

 from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset, LoglikelihoodSingleTokenDataset
 from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn
@@ -57,6 +58,7 @@

 if is_accelerate_available():
+    from accelerate import Accelerator
     from accelerate.utils import calculate_maximum_sizes, convert_bytes, get_max_memory

 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -67,8 +69,8 @@ class BaseModel(LightevalModel):
     def __init__(
         self,
-        config: BaseModelConfig,
         env_config: EnvConfig,
+        config: BaseModelConfig,
     ):
         """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation."""
         self._config = config.init_configs(env_config)
@@ -114,6 +116,72 @@ def __init__(

         self.pairwise_tokenization = config.pairwise_tokenization

+    @classmethod
+    def from_model(
+        cls,
+        model: Union[AutoModelForCausalLM, LightevalModel],
+        env_config: EnvConfig,
+        accelerator: "Accelerator" = None,
+        tokenizer_name: str = None,  # custom tokenizer
+        trust_remote_code: bool = False,
+        use_chat_template: bool = False,
+        add_special_tokens: bool = True,
+        pairwise_tokenization: bool = False,
+        multichoice_continuations_start_space: bool = None,
+    ):
+        # Slightly hackish way to test if the model is a AutoModelForCausalLM, since the instances don't
+        # derive from this class explicitely
+        assert isinstance(model, LightevalModel) or type(model).__name__ in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()
+
+        if isinstance(model, LightevalModel):
+            return model
+
+        # Instanciate the object without using __init__
+        self = cls.__new__(cls)
+        self._config = model.config
+        self._max_length = self._init_max_length(max_length=model.config.max_length)
+        self._tokenizer = self._create_auto_tokenizer_with_name(
+            model_name=model.name_or_path,
+            revision=model.config._commit_hash,
+            env_config=env_config,
+            trust_remote_code=trust_remote_code,
+            tokenizer_name=tokenizer_name,
+        )
+        self.model_name = _simplify_name(model.name_or_path)
+        self.model_sha = model.config._commit_hash
+
+        # If model_parallel is not set we compare the number of processes with the number of GPUs
+        self.model = model
+        self.model.eval()
+        torch.set_grad_enabled(False)
+
+        self.accelerator = accelerator
+        if accelerator is not None:
+            self._device = accelerator.device
+            self.model = self.accelerator.prepare(self.model.to(accelerator.device))
+        else:
+            self._device = "cpu"
+
+        self.use_chat_template = use_chat_template
+        self._add_special_tokens = add_special_tokens if add_special_tokens is not None else False
+        self.pairwise_tokenization = pairwise_tokenization
+        self.multichoice_continuations_start_space = multichoice_continuations_start_space
+
+        self.precision = _get_dtype(model.dtype, config=self._config)
+
+        if is_accelerate_available():
+            model_size, _ = calculate_maximum_sizes(self.model)
+            model_size = convert_bytes(model_size)
+        else:
+            model_size = -1
+        self.model_info = ModelInfo(
+            model_name=self.model_name,
+            model_sha=self.model_sha,
+            model_dtype=self.precision,
+            model_size=model_size,
+        )
+        return self
+
     @property
     def tokenizer(self):
         return self._tokenizer
@@ -207,10 +275,23 @@ def _create_auto_model(self, config: BaseModelConfig, env_config: EnvConfig) ->
     def _create_auto_tokenizer(
         self, config: BaseModelConfig, env_config: EnvConfig
     ) -> transformers.PreTrainedTokenizer:
-        return self._create_auto_tokenizer_with_name(config.pretrained, config=config, env_config=env_config)
+        return self._create_auto_tokenizer_with_name(
+            model_name=config.pretrained,
+            revision=config.revision,
+            env_config=env_config,
+            tokenizer_name=config.tokenizer,
+            subfolder=config.subfolder,
+            trust_remote_code=config.trust_remote_code,
+        )

     def _create_auto_tokenizer_with_name(
-        self, model_name: str, config: BaseModelConfig, env_config: EnvConfig
+        self,
+        model_name: str,
+        revision: str,
+        env_config: EnvConfig,
+        tokenizer_name: str = None,
+        subfolder: str = None,
+        trust_remote_code: bool = False,
     ) -> transformers.PreTrainedTokenizer:
         """
         Create a Hugging Face AutoTokenizer for language model.
@@ -231,25 +312,35 @@
         """
         try:
             tokenizer = AutoTokenizer.from_pretrained(
-                model_name if config.tokenizer is None else config.tokenizer,
-                revision=config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""),
+                model_name if tokenizer_name is None else tokenizer_name,
+                revision=revision + (f"/{subfolder}" if subfolder is not None else ""),
                 cache_dir=env_config.cache_dir,
                 token=env_config.token,
-                trust_remote_code=config.trust_remote_code,
+                trust_remote_code=trust_remote_code,
                 padding_side="left",
                 truncation_side="left",
             )
         except RecursionError:
             tokenizer = AutoTokenizer.from_pretrained(
-                model_name if config.tokenizer is None else config.tokenizer,
-                revision=config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""),
+                model_name if tokenizer_name is None else tokenizer_name,
+                revision=revision + (f"/{subfolder}" if subfolder is not None else ""),
                 cache_dir=env_config.cache_dir,
                 token=env_config.token,
-                trust_remote_code=config.trust_remote_code,
+                trust_remote_code=trust_remote_code,
                 unk_token="",
                 padding_side="left",
                 truncation_side="left",
             )
+        except FileNotFoundError:
+            hlog_warn("Problem when loading the tokenizer in the cache - discarding the provided cache path value.")
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name if tokenizer_name is None else tokenizer_name,
+                revision=revision + (f"/{subfolder}" if subfolder is not None else ""),
+                token=env_config.token,
+                trust_remote_code=trust_remote_code,
+                padding_side="left",
+                truncation_side="left",
+            )
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.model_max_length = self.max_length
         hlog("Tokenizer truncation and padding size set to the left side.")
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index db0ede47..da4fb045 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -34,7 +34,7 @@
 from lighteval.logging.evaluation_tracker import EvaluationTracker
 from lighteval.logging.hierarchical_logger import hlog, htrack_block
 from lighteval.metrics.utils.metric_utils import MetricCategory
-from lighteval.models.model_loader import load_model
+from lighteval.models.model_loader import BaseModel, load_model
 from lighteval.models.model_output import ModelResponse
 from lighteval.tasks.lighteval_task import LightevalTask, create_requests_from_tasks
 from lighteval.tasks.registry import Registry, taskinfo_selector
@@ -164,7 +164,15 @@ def _init_model(self, model_config, model):
             )
         else:
             return load_model(config=model_config, env_config=self.pipeline_parameters.env_config)
-        return model
+        if isinstance(model, BaseModel):
+            return model
+        else:
+            return BaseModel.from_model(
+                model=model,
+                use_chat_template=self.pipeline_parameters.use_chat_template,
+                env_config=self.pipeline_parameters.env_config,
+                accelerator=self.accelerator,
+            )

     def _init_tasks_and_requests(self, tasks: str):
         with htrack_block("Tasks loading"):

From c173871b4eefddc4bb02574305db1759ffaaeb55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?=
Date: Tue, 19 Nov 2024 08:30:03 -0400
Subject: [PATCH 11/12] fix ukr/rus (#394)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
---
 src/lighteval/tasks/templates/utils/translation_literals.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py
index ef5194f3..186d6448 100644
--- a/src/lighteval/tasks/templates/utils/translation_literals.py
+++ b/src/lighteval/tasks/templates/utils/translation_literals.py
@@ -631,7 +631,7 @@ def __getattribute__(self, name: str) -> str:
         language=Language.RUSSIAN,
         question_word="вопрос",
         answer="ответ",
-        confirmation_word="не так ли",
+        confirmation_word="верно",
         yes="да",
         no="нет",
         also="к тому же",
@@ -936,7 +936,7 @@ def __getattribute__(self, name: str) -> str:
         language=Language.UKRAINIAN,
         question_word="питання",
         answer="відповідь",
-        confirmation_word="правда",
+        confirmation_word="вірно",
         yes="так",
         no="ні",
         also="також",

From 85c0d9f0fb5106d834666fce6867e28d02a9b4e6 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov
Date: Wed, 20 Nov 2024 11:08:18 +0100
Subject: [PATCH 12/12] fix repeated cleanup (#399)

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
---
 src/lighteval/models/vllm_model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lighteval/models/vllm_model.py b/src/lighteval/models/vllm_model.py
index 22051e8f..dc242c60 100644
--- a/src/lighteval/models/vllm_model.py
+++ b/src/lighteval/models/vllm_model.py
@@ -98,7 +98,8 @@ def tokenizer(self):

     def cleanup(self):
         destroy_model_parallel()
-        del self.model.llm_engine.model_executor.driver_worker
+        if self.model is not None:
+            del self.model.llm_engine.model_executor.driver_worker
         self.model = None
         gc.collect()
         ray.shutdown()