From 9f38b3a902eae5bb7516842407263ffa66ace1dc Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Thu, 11 Aug 2022 18:05:09 +0200
Subject: [PATCH 01/11] Activate mlsum

---
 lm_eval/tasks/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 11feacad2c..1dd385d79c 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -204,8 +204,8 @@
     # TODO: Not Yet Available in `promptsource/eval-hackathon`
     ########################################################
     # GEM/mlsum
-    # "mlsum_es": gem_mlsum.GEMMLSUMEs,
-    # "mlsum_de": gem_mlsum.GEMMLSUMDe,
+    "mlsum_es": gem_mlsum.GEMMLSUMEs,
+    "mlsum_de": gem_mlsum.GEMMLSUMDe,
     # "mlsum_es_covid_challenge_set": gem_mlsum.GEMMLSUMEsChallgeTestCovid,
     # "mlsum_de_covid_challenge_set": gem_mlsum.GEMMLSUMDeChallgeTestCovid,
     # LAMA

From c02722afe7f894bffb73322b23fd8e16b40bd82b Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Fri, 12 Aug 2022 18:10:05 +0200
Subject: [PATCH 02/11] Do not force import

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 79f5d2db87..215e728a43 100644
--- a/main.py
+++ b/main.py
@@ -3,7 +3,6 @@
 import json
 import logging
 import os
-from codecarbon import OfflineEmissionsTracker
 
 import lm_eval.evaluator as evaluator
 from lm_eval.api import utils
@@ -178,6 +177,7 @@ def main():
             bootstrap_iters=args.bootstrap_iters,
         )
     else:
+        from codecarbon import OfflineEmissionsTracker
         with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"):
             print()  # Add newline between emissions tracker and evaluation logging.
             results = evaluator.cli_evaluate(

From 2b87fb5bf7f1c4fd128600cfdde9cadd2a8af4ba Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Fri, 12 Aug 2022 18:12:12 +0200
Subject: [PATCH 03/11] Do not force sql import

---
 lm_eval/api/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py
index 489c3b900f..0955db52bd 100644
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -4,7 +4,6 @@
 import os
 import torch
 import torch.nn.functional as F
-from sqlitedict import SqliteDict
 from tqdm import tqdm
 from typing import Iterable, List, Optional, Tuple, Union
 from transformers import BatchEncoding
@@ -374,6 +373,7 @@ def __init__(self, lm: LM, cache_db: str):
         :param cache_db: str
             Path to cache db
         """
+        from sqlitedict import SqliteDict
        self.lm = lm
        if os.path.dirname(cache_db):
            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
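Patches 02 and 03 apply the same deferred-import pattern: an optional dependency (`codecarbon`, `sqlitedict`) is imported inside the one code path that uses it rather than at module scope, so the package imports cleanly when the extra is not installed. A minimal sketch of the pattern; the `make_tracker` wrapper function is hypothetical and not part of the diff:

```python
def make_tracker(country_iso_code: str = "FRA"):
    """Build an emissions tracker, importing codecarbon only on demand."""
    try:
        # Importing here instead of at module top level means users without
        # the optional dependency can still use the rest of the package.
        from codecarbon import OfflineEmissionsTracker
    except ImportError as err:
        raise ImportError(
            "Emissions tracking requires codecarbon: `pip install codecarbon`"
        ) from err
    return OfflineEmissionsTracker(country_iso_code=country_iso_code, log_level="error")
```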
From 31d557e0e15d10642c348a6cda5d9e7327f81f01 Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Mon, 15 Aug 2022 21:53:50 +0200
Subject: [PATCH 04/11] Set min_length to None by default

---
 lm_eval/models/huggingface.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index 00d1cead50..88ab0e861d 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -65,7 +65,7 @@ def __init__(
         subfolder: Optional[str] = None,
         revision: Optional[str] = "main",
         batch_size: Optional[int] = 1,
-        min_gen_toks: Optional[int] = 0,
+        min_length: Optional[int] = None,
         max_gen_toks: Optional[int] = 256,
         max_length: Optional[int] = None,
         use_accelerate: Optional[bool] = False,
@@ -107,7 +107,7 @@ def __init__(
         self._batch_size = batch_size  # TODO: Adaptive batch size
         self._max_gen_toks = max_gen_toks
-        self._min_gen_toks = min_gen_toks
+        self.min_length = min_length
         self._max_length = max_length
         self._config = transformers.AutoConfig.from_pretrained(pretrained)
@@ -283,7 +283,7 @@ def _collate(x):
             responses = self._model_generate(
                 inputs={"input_ids": input_ids, "attention_mask": attention_mask},
                 max_tokens=max_tokens,
-                min_tokens=self._min_gen_toks,
+                min_length=self.min_length,
                 stop=until,
             )
             responses = self.tok_decode(responses.tolist())
@@ -331,12 +331,10 @@ def _model_generate(
         self,
         inputs: TokenSequence,
         max_tokens: int,
-        min_tokens: int,
+        min_length: int = None,
         stop: Optional[List[str]] = None,
     ) -> TokenSequence:
         stopping_criteria = stop_sequences_criteria(self.tokenizer, stop)
-        # Generate at least min_tokens; min_length = prompt + gen tokens
-        min_length = inputs["input_ids"].shape[1] + min_tokens
         generations = self.model.generate(
             **inputs,
             # GPT style models require the `generate` `max_length` arg to include the
@@ -483,11 +481,10 @@ def _model_generate(
         self,
         inputs: TokenSequence,
         max_tokens: int,
-        min_tokens: int,
+        min_length: int = None,
         stop: Optional[List[str]] = None,
     ) -> Union[TokenSequence, List[str]]:
         stopping_criteria = stop_sequences_criteria(self.tokenizer, stop)
-        min_length = inputs["input_ids"].shape[1] + min_tokens
         generations = self.model.generate(
             **inputs,
             min_length=min_length,

From 5d1f4930946afe00c9182e44d08362b8f15fe787 Mon Sep 17 00:00:00 2001
From: jon-tow
Date: Wed, 17 Aug 2022 12:12:16 -0400
Subject: [PATCH 05/11] Remove tasks unavailable in promptsource

---
 lm_eval/tasks/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 1dd385d79c..11feacad2c 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -204,8 +204,8 @@
     # TODO: Not Yet Available in `promptsource/eval-hackathon`
     ########################################################
     # GEM/mlsum
-    "mlsum_es": gem_mlsum.GEMMLSUMEs,
-    "mlsum_de": gem_mlsum.GEMMLSUMDe,
+    # "mlsum_es": gem_mlsum.GEMMLSUMEs,
+    # "mlsum_de": gem_mlsum.GEMMLSUMDe,
     # "mlsum_es_covid_challenge_set": gem_mlsum.GEMMLSUMEsChallgeTestCovid,
     # "mlsum_de_covid_challenge_set": gem_mlsum.GEMMLSUMDeChallgeTestCovid,
     # LAMA
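Patch 04 replaces the derived `min_tokens` argument with a pass-through `min_length`. For decoder-only models, Hugging Face's `generate` counts `min_length` (like `max_length`) over prompt plus generated tokens, which is why the removed code added `inputs["input_ids"].shape[1]` to the requested minimum; with the new default of `None`, `generate` simply falls back to its own configured minimum. A rough usage sketch, where the model choice and prompt are illustrative:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The law states that", return_tensors="pt")
prompt_len = inputs["input_ids"].shape[1]

# min_length=None (the new default) defers to the model's generation config;
# a concrete value would have to include the prompt length for GPT-style models.
out = model.generate(**inputs, max_length=prompt_len + 20, min_length=None)
print(tokenizer.decode(out[0]))
```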
From 8a977091d47e26b49f3717335a1a7bf7f94f4255 Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Wed, 17 Aug 2022 18:14:11 +0200
Subject: [PATCH 06/11] Add MultiEURLEX

---
 lm_eval/tasks/__init__.py     |  3 ++
 lm_eval/tasks/multi_eurlex.py | 65 +++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 lm_eval/tasks/multi_eurlex.py

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 1dd385d79c..4477acffae 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -25,6 +25,7 @@
 from . import jigsaw_unintended_bias
 from . import lama
 from . import lince
+from . import multi_eurlex
 from . import piaf
 from . import race
 from . import schema_guided_dstc8
@@ -129,6 +130,8 @@
     # WMT
     # Format: `wmt{year}_{lang1}_{lang2}`
     **wmt.construct_tasks(),
+    # MultiEURLEX
+    "multi_eurlex_mt": multi_eurlex.MultiEURLEXMT,
     # BLiMP
     "blimp_adjunct_island": blimp.BlimpAdjunctIsland,
     "blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
diff --git a/lm_eval/tasks/multi_eurlex.py b/lm_eval/tasks/multi_eurlex.py
new file mode 100644
index 0000000000..b03c8d0990
--- /dev/null
+++ b/lm_eval/tasks/multi_eurlex.py
@@ -0,0 +1,65 @@
+"""
+MultiEURLEX
+"""
+from typing import Dict, List, Optional
+from lm_eval.api.task import PromptSourceTask
+
+_CITATION = """
+"""
+
+_LANGUAGES = [
+    "en",
+    "da",
+    "de",
+    "nl",
+    "sv",
+    "bg",
+    "cs",
+    "hr",
+    "pl",
+    "sk",
+    "sl",
+    "es",
+    "fr",
+    "it",
+    "pt",
+    "ro",
+    "et",
+    "fi",
+    "hu",
+    "lt",
+    "lv",
+    "el",
+    "mt",
+]
+
+class MultiEURLEXMT(PromptSourceTask):
+    DATASET_PATH = "multi_eurlex"
+    DATASET_NAME = "all_languages"
+    VERSION = 0
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+
+    def max_generation_length(self) -> Optional[int]:
+        return 1024
+
+

From cd10264c9ca566cbbe6b78cf4e51b45bfcb809d1 Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Wed, 17 Aug 2022 19:00:39 +0200
Subject: [PATCH 07/11] Unicode <3

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 215e728a43..05e2279f05 100644
--- a/main.py
+++ b/main.py
@@ -138,7 +138,7 @@ def setup_example_logger(output_path):
     """Sets up a logger that will save each example and prediction."""
     example_logger = logging.getLogger("examples")
     filename = f"./outputs/examples-{output_path}.jsonl"
-    formatter = logging.Formatter("%(message)s")
+    formatter = logging.Formatter(u"%(message)s")
     handler = logging.FileHandler(filename)
     handler.setFormatter(formatter)
     example_logger.addHandler(handler)
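Patch 07 is effectively a no-op: in Python 3 the `u` string prefix is accepted only for Python 2 compatibility and changes nothing, which is presumably why patch 08 below reverts it and instead fixes the real problem, ASCII-escaping in `json.dumps`. A quick check:

```python
# In Python 3 the `u` prefix does not change the type or value of a literal,
# so swapping "%(message)s" for u"%(message)s" cannot affect log encoding.
assert u"%(message)s" == "%(message)s"
assert isinstance(u"é", str)
```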
From e5e81fa2c99f66a6826ffec0a14eda1cf7d8de03 Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Wed, 17 Aug 2022 19:08:25 +0200
Subject: [PATCH 08/11] Make examples Unicode

---
 lm_eval/evaluator.py | 4 ++--
 main.py              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py
index 9c99b93021..c1fc3dd25a 100644
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -226,12 +226,12 @@ def evaluate(
                 metrics, example = output
                 example.update(fewshot_logging_info)
                 example.update(task.get_logging_info())
-                example_logger.info(json.dumps(example))
+                example_logger.info(json.dumps(example, ensure_ascii=False))
             else:
                 metrics = output
                 example = fewshot_logging_info
                 example.update(task.get_logging_info())
-                example_logger.info(json.dumps(example))
+                example_logger.info(json.dumps(example, ensure_ascii=False))
 
             for metric, value in metrics.items():
                 vals[(task_template_key, metric)].append(value)
diff --git a/main.py b/main.py
index 05e2279f05..215e728a43 100644
--- a/main.py
+++ b/main.py
@@ -138,7 +138,7 @@ def setup_example_logger(output_path):
     """Sets up a logger that will save each example and prediction."""
     example_logger = logging.getLogger("examples")
     filename = f"./outputs/examples-{output_path}.jsonl"
-    formatter = logging.Formatter(u"%(message)s")
+    formatter = logging.Formatter("%(message)s")
     handler = logging.FileHandler(filename)
     handler.setFormatter(formatter)
     example_logger.addHandler(handler)

From 6ea8ce267202ed1a5aa51d404931958bf1b19beb Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Sat, 3 Sep 2022 20:23:24 +0200
Subject: [PATCH 09/11] Add \n

---
 lm_eval/api/model.py | 1 +
 main.py              | 1 +
 2 files changed, 2 insertions(+)

diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py
index 0955db52bd..8529c92327 100644
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -374,6 +374,7 @@ def __init__(self, lm: LM, cache_db: str):
             Path to cache db
         """
         from sqlitedict import SqliteDict
+
         self.lm = lm
         if os.path.dirname(cache_db):
             os.makedirs(os.path.dirname(cache_db), exist_ok=True)
diff --git a/main.py b/main.py
index 0095e67523..07570bedce 100644
--- a/main.py
+++ b/main.py
@@ -176,6 +176,7 @@ def main():
         )
     else:
         from codecarbon import OfflineEmissionsTracker
+
         with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"):
             print()  # Add newline between emissions tracker and evaluation logging.
             results = evaluator.cli_evaluate(

From 8eb5f69fb7a3a72e22159327111846c3583439cc Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Sat, 3 Sep 2022 20:23:52 +0200
Subject: [PATCH 10/11] Add \n

---
 lm_eval/api/model.py | 1 -
 main.py              | 1 -
 2 files changed, 2 deletions(-)

diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py
index 8529c92327..0955db52bd 100644
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -374,7 +374,6 @@ def __init__(self, lm: LM, cache_db: str):
             Path to cache db
         """
         from sqlitedict import SqliteDict
-
         self.lm = lm
         if os.path.dirname(cache_db):
             os.makedirs(os.path.dirname(cache_db), exist_ok=True)
diff --git a/main.py b/main.py
index 07570bedce..0095e67523 100644
--- a/main.py
+++ b/main.py
@@ -176,7 +176,6 @@ def main():
         )
     else:
         from codecarbon import OfflineEmissionsTracker
-
         with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"):
             print()  # Add newline between emissions tracker and evaluation logging.
             results = evaluator.cli_evaluate(

From 2cef9519aa204ddb7e44bfd8c3b4f1e7f2b5229c Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Tue, 6 Sep 2022 17:04:58 +0200
Subject: [PATCH 11/11] Activate mlsum

---
 lm_eval/tasks/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index bbfd003fd5..4ac9529bdf 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -207,10 +207,10 @@
     # TODO: Not Yet Available in `promptsource/eval-hackathon`
     ########################################################
     # GEM/mlsum
-    # "mlsum_es": gem_mlsum.GEMMLSUMEs,
-    # "mlsum_de": gem_mlsum.GEMMLSUMDe,
-    # "mlsum_es_covid_challenge_set": gem_mlsum.GEMMLSUMEsChallgeTestCovid,
-    # "mlsum_de_covid_challenge_set": gem_mlsum.GEMMLSUMDeChallgeTestCovid,
+    "mlsum_es": gem_mlsum.GEMMLSUMEs,
+    "mlsum_de": gem_mlsum.GEMMLSUMDe,
+    "mlsum_es_covid_challenge_set": gem_mlsum.GEMMLSUMEsChallgeTestCovid,
+    "mlsum_de_covid_challenge_set": gem_mlsum.GEMMLSUMDeChallgeTestCovid,
     # LAMA
     # "bigscience-lama": lama.BigScienceLAMA,
     ########################################################
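For reference, the `ensure_ascii=False` change in patch 08 is what actually makes the example logs readable for non-English tasks such as `mlsum_es`/`mlsum_de`; by default `json.dumps` escapes every non-ASCII character. The dict below is illustrative:

```python
import json

example = {"target": "Städte", "pred": "東京"}
# Default: non-ASCII characters are escaped, so .jsonl logs for
# non-English tasks become hard to read.
print(json.dumps(example))                      # {"target": "St\u00e4dte", "pred": "\u6771\u4eac"}
# ensure_ascii=False keeps the original characters, as in patch 08.
print(json.dumps(example, ensure_ascii=False))  # {"target": "Städte", "pred": "東京"}
```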