diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py
index 81c0ff0df0..b44448fd30 100644
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -392,7 +392,6 @@ def __init__(self, lm: LM, cache_db: str):
                 Path to the `cache` database.
         """
         from sqlitedict import SqliteDict
-
         self.lm = lm
         if os.path.dirname(cache_db):
             os.makedirs(os.path.dirname(cache_db), exist_ok=True)
diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py
index 4f517c7604..8fdaf8f5f9 100644
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -237,12 +237,12 @@ def evaluate(
                 metrics, example = output
                 example.update(fewshot_logging_info)
                 example.update(task.get_logging_info())
-                example_logger.info(json.dumps(example))
+                example_logger.info(json.dumps(example, ensure_ascii=False))
             else:
                 metrics = output
                 example = fewshot_logging_info
                 example.update(task.get_logging_info())
-                example_logger.info(json.dumps(example))
+                example_logger.info(json.dumps(example, ensure_ascii=False))
 
             for metric, value in metrics.items():
                 vals[(task_template_key, metric)].append(value)
diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index e53311ab75..becf780bb9 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -25,6 +25,7 @@
 from . import jigsaw_unintended_bias
 from . import lama
 from . import lince
+from . import multi_eurlex
 from . import piaf
 from . import race
 from . import schema_guided_dstc8
@@ -134,6 +135,8 @@
     # WMT
     # Format: `wmt{year}_{lang1}_{lang2}`
     **wmt.construct_tasks(),
+    # MultiEURLEX
+    "multi_eurlex_mt": multi_eurlex.MultiEURLEXMT,
     # BLiMP
     "blimp_adjunct_island": blimp.BlimpAdjunctIsland,
     "blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
@@ -209,10 +212,10 @@
     # TODO: Not Yet Available in `promptsource/eval-hackathon`
     ########################################################
     # GEM/mlsum
-    # "mlsum_es": gem_mlsum.GEMMLSUMEs,
-    # "mlsum_de": gem_mlsum.GEMMLSUMDe,
-    # "mlsum_es_covid_challenge_set": gem_mlsum.GEMMLSUMEsChallgeTestCovid,
-    # "mlsum_de_covid_challenge_set": gem_mlsum.GEMMLSUMDeChallgeTestCovid,
+    "mlsum_es": gem_mlsum.GEMMLSUMEs,
+    "mlsum_de": gem_mlsum.GEMMLSUMDe,
+    "mlsum_es_covid_challenge_set": gem_mlsum.GEMMLSUMEsChallgeTestCovid,
+    "mlsum_de_covid_challenge_set": gem_mlsum.GEMMLSUMDeChallgeTestCovid,
     # LAMA
     # "bigscience-lama": lama.BigScienceLAMA,
     ########################################################
diff --git a/lm_eval/tasks/multi_eurlex.py b/lm_eval/tasks/multi_eurlex.py
new file mode 100644
index 0000000000..b03c8d0990
--- /dev/null
+++ b/lm_eval/tasks/multi_eurlex.py
@@ -0,0 +1,65 @@
+"""
+MultiEURLEX
+"""
+from typing import Dict, List, Optional
+from lm_eval.api.task import PromptSourceTask
+
+_CITATION = """
+"""
+
+_LANGUAGES = [
+    "en",
+    "da",
+    "de",
+    "nl",
+    "sv",
+    "bg",
+    "cs",
+    "hr",
+    "pl",
+    "sk",
+    "sl",
+    "es",
+    "fr",
+    "it",
+    "pt",
+    "ro",
+    "et",
+    "fi",
+    "hu",
+    "lt",
+    "lv",
+    "el",
+    "mt",
+]
+
+class MultiEURLEXMT(PromptSourceTask):
+    DATASET_PATH = "multi_eurlex"
+    DATASET_NAME = "all_languages"
+    VERSION = 0
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+
+    def max_generation_length(self) -> Optional[int]:
+        return 1024
+
+
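To sanity-check the new registration, the class can be looked up from the task registry. A minimal sketch, outside this patch, assuming the registry dict edited above is exposed as `lm_eval.tasks.TASK_REGISTRY` (that is its name upstream; the name itself is not visible in this hunk):

    from lm_eval.tasks import TASK_REGISTRY  # assumed name of the dict edited above

    # The entry added in lm_eval/tasks/__init__.py maps the task name to the class.
    task_cls = TASK_REGISTRY["multi_eurlex_mt"]
    assert task_cls.DATASET_PATH == "multi_eurlex"
    assert task_cls.DATASET_NAME == "all_languages"
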
diff --git a/main.py b/main.py
index c2182079b9..0bdbe41041 100644
--- a/main.py
+++ b/main.py
@@ -180,7 +180,6 @@ def main():
         results = evaluator.cli_evaluate(**evaluate_args)
     else:
         from codecarbon import OfflineEmissionsTracker
-
         with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"):
             print()  # Add newline between emissions tracker and evaluation logging.
             results = evaluator.cli_evaluate(**evaluate_args)
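Why `ensure_ascii=False` matters for the evaluator change: MultiEURLEX spans 23 EU languages, including Greek and Bulgarian, and by default `json.dumps` escapes every non-ASCII character, which makes the per-example logs unreadable. A minimal sketch of the difference (the example dict is hypothetical):

    import json

    # Hypothetical per-example record, as a Greek MultiEURLEX document would produce.
    example = {"task": "multi_eurlex_mt", "target": "Επίσημη Εφημερίδα"}

    # Default behavior: non-ASCII characters become \uXXXX escapes.
    print(json.dumps(example))
    # {"task": "multi_eurlex_mt", "target": "\u0395\u03c0\u03af\u03c3\u03b7\u03bc\u03b7 ..."}

    # With ensure_ascii=False the text is written through unchanged.
    print(json.dumps(example, ensure_ascii=False))
    # {"task": "multi_eurlex_mt", "target": "Επίσημη Εφημερίδα"}

The trade-off is that the log sink must accept UTF-8, which Python file handlers do when opened with a UTF-8 encoding.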