From 53beb55f7749f1cf4d5e1ed181dd0d52cc711b67 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Wed, 16 Oct 2024 14:44:28 +0300 Subject: [PATCH 01/14] initial unitxt evaluator Signed-off-by: Roni Friedman-Melamed --- my_tasks/my_task.yaml | 3 ++ my_tasks/unitxt | 1 + src/instructlab/eval/unitxt.py | 65 ++++++++++++++++++++++++++++++++++ tests/test_unitxt.py | 22 ++++++++++++ 4 files changed, 91 insertions(+) create mode 100644 my_tasks/my_task.yaml create mode 100644 my_tasks/unitxt create mode 100644 src/instructlab/eval/unitxt.py create mode 100644 tests/test_unitxt.py diff --git a/my_tasks/my_task.yaml b/my_tasks/my_task.yaml new file mode 100644 index 00000000..50d069d9 --- /dev/null +++ b/my_tasks/my_task.yaml @@ -0,0 +1,3 @@ +task: my_task +include: unitxt +recipe: card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10 \ No newline at end of file diff --git a/my_tasks/unitxt b/my_tasks/unitxt new file mode 100644 index 00000000..a933f4a1 --- /dev/null +++ b/my_tasks/unitxt @@ -0,0 +1 @@ +class: !function /Users/ronches/miniforge3/envs/lmeval/lib/python3.10/site-packages/lm_eval/tasks/unitxt/task.Unitxt \ No newline at end of file diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py new file mode 100644 index 00000000..0db39f62 --- /dev/null +++ b/src/instructlab/eval/unitxt.py @@ -0,0 +1,65 @@ +""" +Unitxt - Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI +https://github.com/IBM/unitxt +https://arxiv.org/abs/2401.14019 +""" + +# Standard +import os + +# First Party +from instructlab.eval.mmlu import MMLUBranchEvaluator + +# Local +from .logger_config import setup_logger + +logger = setup_logger(__name__) + +class UnitxtEvaluator(MMLUBranchEvaluator): + name = "unitxt" + def __init__( + self, + model_path, + tasks_dir: str, + tasks: list[str], + # unitxt_recipe: str, + ): + # tasks,tasks_dir = self.prepare_files(unitxt_recipe) + super().__init__( + model_path = model_path, + tasks_dir = tasks_dir, + tasks = tasks, + few_shots = 0 + ) + + def prepare_files(self, unitxt_recipe)->tuple: + tasks = '' + tasks_dir = '' + return tasks,tasks_dir + + def run(self,server_url: str | None = None) -> tuple: + """ + Runs evaluation + + Returns: + overall_scores Average scores for the task group + individual_scores Individual scores for each task in the task group + """ + logger.debug(locals()) + os.environ["TOKENIZERS_PARALLELISM"] = "true" + results = self._run_mmlu(server_url=server_url) + with open('my_tasks/output.txt', 'w') as f: + print(results, file=f) + taskname = self.tasks[0] + global_scores = results[taskname] + global_scores.pop('alias') + instance_scores = None + # instances = results['samples'][taskname] + # instance_scores = {} + # metrics = [metric.replace('metrics.','') for metric in instances[0]['doc']['metrics']] + # for i,instance in enumerate(instances): + # scores = {} + # for metric in metrics: + # scores[metric] = instance[metric][0] + # instance_scores[i] = scores + return global_scores,instance_scores diff --git a/tests/test_unitxt.py b/tests/test_unitxt.py new file mode 100644 index 00000000..5993429c --- /dev/null +++ b/tests/test_unitxt.py @@ -0,0 +1,22 @@ +# First Party +from instructlab.eval.unitxt import UnitxtEvaluator + + +def test_unitxt(): + print("===> Executing 'test_unitxt'...") + try: + model_path = "instructlab/granite-7b-lab" + tasks = ["my_task"] + unitxt = UnitxtEvaluator( + model_path=model_path, tasks_dir='./my_tasks/', tasks=tasks + ) + overall_score, _ = unitxt.run() + print(overall_score) + except Exception as exc: + print(f"'test_unitxt_branch' failed: {exc}") + return False + return True + + +if __name__ == "__main__": + assert test_unitxt() == True \ No newline at end of file From 19bad5972b3a923131c9e03c092ef60c5f6fd69f Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Sun, 20 Oct 2024 14:38:35 +0300 Subject: [PATCH 02/14] create unitxt files on the fly Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/mmlu.py | 7 ++- src/instructlab/eval/unitxt.py | 89 ++++++++++++++++++++++++++-------- tests/test_unitxt.py | 8 +-- 3 files changed, 77 insertions(+), 27 deletions(-) diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py index f893b66a..4c56686e 100644 --- a/src/instructlab/eval/mmlu.py +++ b/src/instructlab/eval/mmlu.py @@ -153,7 +153,7 @@ def run(self, server_url: str | None = None) -> tuple: return overall_score, individual_scores - def _run_mmlu(self, server_url: str | None = None) -> dict: + def _run_mmlu(self, server_url: str | None = None, return_all_results:bool = False) -> dict: if server_url is not None: # Requires lm_eval >= 0.4.4 model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface" @@ -177,7 +177,10 @@ def _run_mmlu(self, server_url: str | None = None) -> dict: device=self.device, task_manager=tm, ) - results = mmlu_output["results"] + if return_all_results: + results = mmlu_output + else: + results = mmlu_output["results"] return results # This method converts general errors from simple_evaluate diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index 0db39f62..226f0ffb 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -5,7 +5,12 @@ """ # Standard -import os +import os, shutil +import yaml +from uuid import uuid4 + +# Third Party +from lm_eval.tasks.unitxt import task # First Party from instructlab.eval.mmlu import MMLUBranchEvaluator @@ -16,15 +21,23 @@ logger = setup_logger(__name__) class UnitxtEvaluator(MMLUBranchEvaluator): + """ + An evaluator class, running Unitxt evaluation + + Attributes: + model_path absolute path to or name of a huggingface model + unitxt_recipe unitxt recipe (see unitxt.ai for more information) + A Recipe holds a complete specification of a unitxt pipeline + Example: card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10 + + """ name = "unitxt" def __init__( self, - model_path, - tasks_dir: str, - tasks: list[str], - # unitxt_recipe: str, + model_path, + unitxt_recipe: str, ): - # tasks,tasks_dir = self.prepare_files(unitxt_recipe) + tasks,tasks_dir = self.prepare_unitxt_files(unitxt_recipe) super().__init__( model_path = model_path, tasks_dir = tasks_dir, @@ -32,10 +45,19 @@ def __init__( few_shots = 0 ) - def prepare_files(self, unitxt_recipe)->tuple: - tasks = '' - tasks_dir = '' - return tasks,tasks_dir + def prepare_unitxt_files(self, unitxt_recipe)->tuple: + temp_task = str(uuid4()) + temp_tasks_dir = f'unitxt_temp_{temp_task}' + yaml_file = os.path.join(temp_tasks_dir,f"{temp_task}.yaml") + create_unitxt_pointer(temp_tasks_dir) + create_unitxt_yaml(yaml_file=yaml_file, unitxt_recipe=unitxt_recipe, task_name=temp_task) + return temp_task,temp_tasks_dir + + def remove_temp_files(self): + if self.tasks_dir.startswith('temp_'): #to avoid unintended deletion if this class is inherited + shutil.rmtree(self.tasks_dir) + else: + logger.warning("unitxt tasks dir did not start with 'temp_' and therefor was not deleted") def run(self,server_url: str | None = None) -> tuple: """ @@ -47,19 +69,44 @@ def run(self,server_url: str | None = None) -> tuple: """ logger.debug(locals()) os.environ["TOKENIZERS_PARALLELISM"] = "true" - results = self._run_mmlu(server_url=server_url) + results = self._run_mmlu(server_url=server_url, return_all_results=True) with open('my_tasks/output.txt', 'w') as f: print(results, file=f) taskname = self.tasks[0] - global_scores = results[taskname] + global_scores = results['results'][taskname] global_scores.pop('alias') - instance_scores = None - # instances = results['samples'][taskname] - # instance_scores = {} - # metrics = [metric.replace('metrics.','') for metric in instances[0]['doc']['metrics']] - # for i,instance in enumerate(instances): - # scores = {} - # for metric in metrics: - # scores[metric] = instance[metric][0] - # instance_scores[i] = scores + try: + instances = results['samples'][taskname] + instance_scores = {} + metrics = [metric.replace('metrics.','') for metric in instances[0]['doc']['metrics']] + for i,instance in enumerate(instances): + scores = {} + for metric in metrics: + scores[metric] = instance[metric][0] + instance_scores[i] = scores + except Exception as e: + logger.error("Error in extracting single instance scores") + logger.error(e) + logger.error(e.__traceback__) + instance_scores = None + self.remove_temp_files() return global_scores,instance_scores + + +def create_unitxt_yaml(yaml_file,unitxt_recipe, task_name): + data = { + 'task': f'{task_name}', + 'include': 'unitxt', + 'recipe': f'{unitxt_recipe}' + } + with open(yaml_file, 'w') as file: + yaml.dump(data, file, default_flow_style=False) + logger.info(f"task {task} unitxt recipe written to {yaml_file}") + +def create_unitxt_pointer(tasks_dir): + class_line = "class: !function " + task.__file__.replace("task.py", "task.Unitxt") + output_file = os.path.join(tasks_dir,'unitxt') + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, 'w') as f: + f.write(class_line) + logger.info(f"Unitxt task pointer written to {output_file}") diff --git a/tests/test_unitxt.py b/tests/test_unitxt.py index 5993429c..ce9f6b73 100644 --- a/tests/test_unitxt.py +++ b/tests/test_unitxt.py @@ -1,16 +1,16 @@ # First Party -from instructlab.eval.unitxt import UnitxtEvaluator +from instruclab.eval.unitxt import UnitxtEvaluator def test_unitxt(): print("===> Executing 'test_unitxt'...") try: model_path = "instructlab/granite-7b-lab" - tasks = ["my_task"] + unitxt_recipe = "card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10" unitxt = UnitxtEvaluator( - model_path=model_path, tasks_dir='./my_tasks/', tasks=tasks + model_path=model_path, unitxt_recipe=unitxt_recipe ) - overall_score, _ = unitxt.run() + overall_score, single_scores = unitxt.run() print(overall_score) except Exception as exc: print(f"'test_unitxt_branch' failed: {exc}") From bfc3c6ad56bff7aafb31391cb7b7fe8fea50c851 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Sun, 20 Oct 2024 14:50:06 +0300 Subject: [PATCH 03/14] typo in import Signed-off-by: Roni Friedman-Melamed --- my_tasks/my_task.yaml | 3 --- my_tasks/unitxt | 1 - tests/test_unitxt.py | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) delete mode 100644 my_tasks/my_task.yaml delete mode 100644 my_tasks/unitxt diff --git a/my_tasks/my_task.yaml b/my_tasks/my_task.yaml deleted file mode 100644 index 50d069d9..00000000 --- a/my_tasks/my_task.yaml +++ /dev/null @@ -1,3 +0,0 @@ -task: my_task -include: unitxt -recipe: card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10 \ No newline at end of file diff --git a/my_tasks/unitxt b/my_tasks/unitxt deleted file mode 100644 index a933f4a1..00000000 --- a/my_tasks/unitxt +++ /dev/null @@ -1 +0,0 @@ -class: !function /Users/ronches/miniforge3/envs/lmeval/lib/python3.10/site-packages/lm_eval/tasks/unitxt/task.Unitxt \ No newline at end of file diff --git a/tests/test_unitxt.py b/tests/test_unitxt.py index ce9f6b73..d45c59ed 100644 --- a/tests/test_unitxt.py +++ b/tests/test_unitxt.py @@ -1,5 +1,5 @@ # First Party -from instruclab.eval.unitxt import UnitxtEvaluator +from instructlab.eval.unitxt import UnitxtEvaluator def test_unitxt(): From 1ed42564f056bb08c7c5a26cdbdfc88ab846bab9 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Mon, 21 Oct 2024 10:43:19 +0300 Subject: [PATCH 04/14] remove unneeded print Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/unitxt.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index 226f0ffb..b6136641 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -70,8 +70,6 @@ def run(self,server_url: str | None = None) -> tuple: logger.debug(locals()) os.environ["TOKENIZERS_PARALLELISM"] = "true" results = self._run_mmlu(server_url=server_url, return_all_results=True) - with open('my_tasks/output.txt', 'w') as f: - print(results, file=f) taskname = self.tasks[0] global_scores = results['results'][taskname] global_scores.pop('alias') From 6135b9105c74accf9e194f818991e24783db9461 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Mon, 21 Oct 2024 12:05:34 +0300 Subject: [PATCH 05/14] tasks -> [task] Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/unitxt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index b6136641..8a094cef 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -37,11 +37,11 @@ def __init__( model_path, unitxt_recipe: str, ): - tasks,tasks_dir = self.prepare_unitxt_files(unitxt_recipe) + task,tasks_dir = self.prepare_unitxt_files(unitxt_recipe) super().__init__( model_path = model_path, tasks_dir = tasks_dir, - tasks = tasks, + tasks = [task], few_shots = 0 ) From 5976ce4d2fd69e9b77e8af86c36ea8ab493649da Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Mon, 21 Oct 2024 12:18:53 +0300 Subject: [PATCH 06/14] temp dir prefix Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/unitxt.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index 8a094cef..4877297e 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -20,6 +20,8 @@ logger = setup_logger(__name__) +TEMP_DIR_PREFIX = 'unitxt_temp' + class UnitxtEvaluator(MMLUBranchEvaluator): """ An evaluator class, running Unitxt evaluation @@ -47,17 +49,17 @@ def __init__( def prepare_unitxt_files(self, unitxt_recipe)->tuple: temp_task = str(uuid4()) - temp_tasks_dir = f'unitxt_temp_{temp_task}' + temp_tasks_dir = f'{TEMP_DIR_PREFIX}_{temp_task}' yaml_file = os.path.join(temp_tasks_dir,f"{temp_task}.yaml") create_unitxt_pointer(temp_tasks_dir) create_unitxt_yaml(yaml_file=yaml_file, unitxt_recipe=unitxt_recipe, task_name=temp_task) return temp_task,temp_tasks_dir def remove_temp_files(self): - if self.tasks_dir.startswith('temp_'): #to avoid unintended deletion if this class is inherited + if self.tasks_dir.startswith(TEMP_DIR_PREFIX): #to avoid unintended deletion if this class is inherited shutil.rmtree(self.tasks_dir) else: - logger.warning("unitxt tasks dir did not start with 'temp_' and therefor was not deleted") + logger.warning(f"unitxt tasks dir did not start with '{TEMP_DIR_PREFIX}' and therefor was not deleted") def run(self,server_url: str | None = None) -> tuple: """ From 3bdf3e3b97d611843aa5a97f05de74e4ea44106d Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Tue, 22 Oct 2024 15:39:46 +0300 Subject: [PATCH 07/14] create+delete temp files in run() Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/unitxt.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index 4877297e..365e8ce9 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -39,23 +39,29 @@ def __init__( model_path, unitxt_recipe: str, ): - task,tasks_dir = self.prepare_unitxt_files(unitxt_recipe) + task = self.assign_task_name() + tasks_dir = self.assign_tasks_dir(task) super().__init__( model_path = model_path, tasks_dir = tasks_dir, tasks = [task], few_shots = 0 ) + self.unitxt_recipe = unitxt_recipe - def prepare_unitxt_files(self, unitxt_recipe)->tuple: - temp_task = str(uuid4()) - temp_tasks_dir = f'{TEMP_DIR_PREFIX}_{temp_task}' - yaml_file = os.path.join(temp_tasks_dir,f"{temp_task}.yaml") - create_unitxt_pointer(temp_tasks_dir) - create_unitxt_yaml(yaml_file=yaml_file, unitxt_recipe=unitxt_recipe, task_name=temp_task) - return temp_task,temp_tasks_dir + def assign_tasks_dir(self, task): + return f'{TEMP_DIR_PREFIX}_{task}' - def remove_temp_files(self): + def assign_task_name(self): + return str(uuid4()) + + def prepare_unitxt_files(self)->tuple: + task = self.tasks[0] + yaml_file = os.path.join(self.tasks_dir,f"{task}.yaml") + create_unitxt_pointer(self.tasks_dir) + create_unitxt_yaml(yaml_file=yaml_file, unitxt_recipe=self.unitxt_recipe, task_name=task) + + def remove_unitxt_files(self): if self.tasks_dir.startswith(TEMP_DIR_PREFIX): #to avoid unintended deletion if this class is inherited shutil.rmtree(self.tasks_dir) else: @@ -69,6 +75,7 @@ def run(self,server_url: str | None = None) -> tuple: overall_scores Average scores for the task group individual_scores Individual scores for each task in the task group """ + self.prepare_unitxt_files() logger.debug(locals()) os.environ["TOKENIZERS_PARALLELISM"] = "true" results = self._run_mmlu(server_url=server_url, return_all_results=True) @@ -89,7 +96,7 @@ def run(self,server_url: str | None = None) -> tuple: logger.error(e) logger.error(e.__traceback__) instance_scores = None - self.remove_temp_files() + self.remove_unitxt_files() return global_scores,instance_scores @@ -101,7 +108,7 @@ def create_unitxt_yaml(yaml_file,unitxt_recipe, task_name): } with open(yaml_file, 'w') as file: yaml.dump(data, file, default_flow_style=False) - logger.info(f"task {task} unitxt recipe written to {yaml_file}") + logger.debug(f"task {task} unitxt recipe written to {yaml_file}") def create_unitxt_pointer(tasks_dir): class_line = "class: !function " + task.__file__.replace("task.py", "task.Unitxt") @@ -109,4 +116,4 @@ def create_unitxt_pointer(tasks_dir): os.makedirs(os.path.dirname(output_file), exist_ok=True) with open(output_file, 'w') as f: f.write(class_line) - logger.info(f"Unitxt task pointer written to {output_file}") + logger.debug(f"Unitxt task pointer written to {output_file}") From 4b1135787b1c58bf77610b1584776d00c7794793 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Mon, 28 Oct 2024 11:20:42 +0200 Subject: [PATCH 08/14] format: lint Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/mmlu.py | 4 +- src/instructlab/eval/unitxt.py | 86 +++++++++++++++++++--------------- tests/test_unitxt.py | 6 +-- 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py index 4c56686e..ed00334f 100644 --- a/src/instructlab/eval/mmlu.py +++ b/src/instructlab/eval/mmlu.py @@ -153,7 +153,9 @@ def run(self, server_url: str | None = None) -> tuple: return overall_score, individual_scores - def _run_mmlu(self, server_url: str | None = None, return_all_results:bool = False) -> dict: + def _run_mmlu( + self, server_url: str | None = None, return_all_results: bool = False + ) -> dict: if server_url is not None: # Requires lm_eval >= 0.4.4 model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface" diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index 365e8ce9..b1644e77 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -5,12 +5,13 @@ """ # Standard -import os, shutil -import yaml from uuid import uuid4 +import os +import shutil # Third Party from lm_eval.tasks.unitxt import task +import yaml # First Party from instructlab.eval.mmlu import MMLUBranchEvaluator @@ -20,7 +21,8 @@ logger = setup_logger(__name__) -TEMP_DIR_PREFIX = 'unitxt_temp' +TEMP_DIR_PREFIX = "unitxt_temp" + class UnitxtEvaluator(MMLUBranchEvaluator): """ @@ -29,45 +31,51 @@ class UnitxtEvaluator(MMLUBranchEvaluator): Attributes: model_path absolute path to or name of a huggingface model unitxt_recipe unitxt recipe (see unitxt.ai for more information) - A Recipe holds a complete specification of a unitxt pipeline + A Recipe holds a complete specification of a unitxt pipeline Example: card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10 - + """ + name = "unitxt" + def __init__( self, - model_path, + model_path, unitxt_recipe: str, ): - task = self.assign_task_name() - tasks_dir = self.assign_tasks_dir(task) + unitxt_task = self.assign_task_name() + tasks_dir = self.assign_tasks_dir(unitxt_task) super().__init__( - model_path = model_path, - tasks_dir = tasks_dir, - tasks = [task], - few_shots = 0 + model_path=model_path, tasks_dir=tasks_dir, tasks=[unitxt_task], few_shots=0 ) self.unitxt_recipe = unitxt_recipe - def assign_tasks_dir(self, task): - return f'{TEMP_DIR_PREFIX}_{task}' + def assign_tasks_dir(self, task_name): + return f"{TEMP_DIR_PREFIX}_{task_name}" def assign_task_name(self): return str(uuid4()) - def prepare_unitxt_files(self)->tuple: - task = self.tasks[0] - yaml_file = os.path.join(self.tasks_dir,f"{task}.yaml") + def prepare_unitxt_files(self) -> None: + taskname = self.tasks[0] + yaml_file = os.path.join(str(self.tasks_dir), f"{taskname}.yaml") create_unitxt_pointer(self.tasks_dir) - create_unitxt_yaml(yaml_file=yaml_file, unitxt_recipe=self.unitxt_recipe, task_name=task) + create_unitxt_yaml( + yaml_file=yaml_file, unitxt_recipe=self.unitxt_recipe, task_name=taskname + ) def remove_unitxt_files(self): - if self.tasks_dir.startswith(TEMP_DIR_PREFIX): #to avoid unintended deletion if this class is inherited + if self.tasks_dir.startswith( + TEMP_DIR_PREFIX + ): # to avoid unintended deletion if this class is inherited shutil.rmtree(self.tasks_dir) else: - logger.warning(f"unitxt tasks dir did not start with '{TEMP_DIR_PREFIX}' and therefor was not deleted") + logger.warning( + "unitxt tasks dir did not start with '%s' and therefor was not deleted", + TEMP_DIR_PREFIX, + ) - def run(self,server_url: str | None = None) -> tuple: + def run(self, server_url: str | None = None) -> tuple: """ Runs evaluation @@ -80,40 +88,40 @@ def run(self,server_url: str | None = None) -> tuple: os.environ["TOKENIZERS_PARALLELISM"] = "true" results = self._run_mmlu(server_url=server_url, return_all_results=True) taskname = self.tasks[0] - global_scores = results['results'][taskname] - global_scores.pop('alias') + global_scores = results["results"][taskname] + global_scores.pop("alias") try: - instances = results['samples'][taskname] + instances = results["samples"][taskname] instance_scores = {} - metrics = [metric.replace('metrics.','') for metric in instances[0]['doc']['metrics']] - for i,instance in enumerate(instances): + metrics = [ + metric.replace("metrics.", "") + for metric in instances[0]["doc"]["metrics"] + ] + for i, instance in enumerate(instances): scores = {} for metric in metrics: scores[metric] = instance[metric][0] instance_scores[i] = scores - except Exception as e: + except KeyError as e: logger.error("Error in extracting single instance scores") logger.error(e) logger.error(e.__traceback__) instance_scores = None self.remove_unitxt_files() - return global_scores,instance_scores + return global_scores, instance_scores -def create_unitxt_yaml(yaml_file,unitxt_recipe, task_name): - data = { - 'task': f'{task_name}', - 'include': 'unitxt', - 'recipe': f'{unitxt_recipe}' - } - with open(yaml_file, 'w') as file: +def create_unitxt_yaml(yaml_file, unitxt_recipe, task_name): + data = {"task": f"{task_name}", "include": "unitxt", "recipe": f"{unitxt_recipe}"} + with open(yaml_file, "w", encoding="utf-8") as file: yaml.dump(data, file, default_flow_style=False) - logger.debug(f"task {task} unitxt recipe written to {yaml_file}") + logger.debug("task %s unitxt recipe written to %s", task_name, yaml_file) + def create_unitxt_pointer(tasks_dir): class_line = "class: !function " + task.__file__.replace("task.py", "task.Unitxt") - output_file = os.path.join(tasks_dir,'unitxt') + output_file = os.path.join(tasks_dir, "unitxt") os.makedirs(os.path.dirname(output_file), exist_ok=True) - with open(output_file, 'w') as f: + with open(output_file, "w", encoding="utf-8") as f: f.write(class_line) - logger.debug(f"Unitxt task pointer written to {output_file}") + logger.debug("Unitxt task pointer written to %s", output_file) diff --git a/tests/test_unitxt.py b/tests/test_unitxt.py index d45c59ed..57e587d3 100644 --- a/tests/test_unitxt.py +++ b/tests/test_unitxt.py @@ -7,9 +7,7 @@ def test_unitxt(): try: model_path = "instructlab/granite-7b-lab" unitxt_recipe = "card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10" - unitxt = UnitxtEvaluator( - model_path=model_path, unitxt_recipe=unitxt_recipe - ) + unitxt = UnitxtEvaluator(model_path=model_path, unitxt_recipe=unitxt_recipe) overall_score, single_scores = unitxt.run() print(overall_score) except Exception as exc: @@ -19,4 +17,4 @@ def test_unitxt(): if __name__ == "__main__": - assert test_unitxt() == True \ No newline at end of file + assert test_unitxt() == True From 93bdd7381b8747f535ab7bf07fc0d531d297c7cd Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Mon, 28 Oct 2024 15:26:34 +0200 Subject: [PATCH 09/14] asserting unitxt evaluation returns a score Signed-off-by: Roni Friedman-Melamed --- tests/test_unitxt.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_unitxt.py b/tests/test_unitxt.py index 57e587d3..775dd18c 100644 --- a/tests/test_unitxt.py +++ b/tests/test_unitxt.py @@ -10,9 +10,13 @@ def test_unitxt(): unitxt = UnitxtEvaluator(model_path=model_path, unitxt_recipe=unitxt_recipe) overall_score, single_scores = unitxt.run() print(overall_score) + sample_score = 'f1_micro,none' + assert sample_score in overall_score + assert overall_score[sample_score] > 0 except Exception as exc: print(f"'test_unitxt_branch' failed: {exc}") return False + return True From d70b84f5fe30f2db8a7c90df784a3ab95f49e1df Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Mon, 28 Oct 2024 15:28:29 +0200 Subject: [PATCH 10/14] format: ruff Signed-off-by: Roni Friedman-Melamed --- tests/test_unitxt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_unitxt.py b/tests/test_unitxt.py index 775dd18c..145a0a2b 100644 --- a/tests/test_unitxt.py +++ b/tests/test_unitxt.py @@ -10,13 +10,13 @@ def test_unitxt(): unitxt = UnitxtEvaluator(model_path=model_path, unitxt_recipe=unitxt_recipe) overall_score, single_scores = unitxt.run() print(overall_score) - sample_score = 'f1_micro,none' + sample_score = "f1_micro,none" assert sample_score in overall_score - assert overall_score[sample_score] > 0 + assert overall_score[sample_score] > 0 except Exception as exc: print(f"'test_unitxt_branch' failed: {exc}") return False - + return True From 2faee08f140e7cd13eb77172315200a5e8d93a97 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Tue, 29 Oct 2024 15:35:43 +0200 Subject: [PATCH 11/14] review comments Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/unitxt.py | 14 +++++++++----- tests/test_unitxt.py | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index b1644e77..55426667 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -29,7 +29,7 @@ class UnitxtEvaluator(MMLUBranchEvaluator): An evaluator class, running Unitxt evaluation Attributes: - model_path absolute path to or name of a huggingface model + model_path Absolute path to or name of a huggingface model unitxt_recipe unitxt recipe (see unitxt.ai for more information) A Recipe holds a complete specification of a unitxt pipeline Example: card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10 @@ -71,7 +71,8 @@ def remove_unitxt_files(self): shutil.rmtree(self.tasks_dir) else: logger.warning( - "unitxt tasks dir did not start with '%s' and therefor was not deleted", + "unitxt tasks dir '%s' did not start with '%s' prefix and therefore was not deleted", + self.tasks_dir, TEMP_DIR_PREFIX, ) @@ -79,8 +80,11 @@ def run(self, server_url: str | None = None) -> tuple: """ Runs evaluation + Attributes: + server_url(str|None) Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated + Returns: - overall_scores Average scores for the task group + overall_scores Average scores for the task group individual_scores Individual scores for each task in the task group """ self.prepare_unitxt_files() @@ -111,8 +115,8 @@ def run(self, server_url: str | None = None) -> tuple: return global_scores, instance_scores -def create_unitxt_yaml(yaml_file, unitxt_recipe, task_name): - data = {"task": f"{task_name}", "include": "unitxt", "recipe": f"{unitxt_recipe}"} +def create_unitxt_yaml(yaml_file: str, unitxt_recipe: str, task_name: str) -> None: + data = {"task": task_name, "include": "unitxt", "recipe": unitxt_recipe} with open(yaml_file, "w", encoding="utf-8") as file: yaml.dump(data, file, default_flow_style=False) logger.debug("task %s unitxt recipe written to %s", task_name, yaml_file) diff --git a/tests/test_unitxt.py b/tests/test_unitxt.py index 145a0a2b..4178f8c0 100644 --- a/tests/test_unitxt.py +++ b/tests/test_unitxt.py @@ -9,7 +9,7 @@ def test_unitxt(): unitxt_recipe = "card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10" unitxt = UnitxtEvaluator(model_path=model_path, unitxt_recipe=unitxt_recipe) overall_score, single_scores = unitxt.run() - print(overall_score) + print(f"Overall scores: {overall_score}") sample_score = "f1_micro,none" assert sample_score in overall_score assert overall_score[sample_score] > 0 From 777132a60edb0b3037395018de413fdfd1dfaeb1 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Thu, 7 Nov 2024 10:37:57 +0200 Subject: [PATCH 12/14] simplify run_mmlu return value Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/mmlu.py | 10 +++------- src/instructlab/eval/unitxt.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py index ed00334f..f290eb5b 100644 --- a/src/instructlab/eval/mmlu.py +++ b/src/instructlab/eval/mmlu.py @@ -142,7 +142,7 @@ def run(self, server_url: str | None = None) -> tuple: agg_score: float = 0.0 results = self._run_mmlu(server_url) - for task, result in results.items(): + for task, result in results['results'].items(): agg_score += float(result["acc,none"]) individual_scores[task] = { "score": float(result["acc,none"]), @@ -154,7 +154,7 @@ def run(self, server_url: str | None = None) -> tuple: return overall_score, individual_scores def _run_mmlu( - self, server_url: str | None = None, return_all_results: bool = False + self, server_url: str | None = None ) -> dict: if server_url is not None: # Requires lm_eval >= 0.4.4 @@ -179,11 +179,7 @@ def _run_mmlu( device=self.device, task_manager=tm, ) - if return_all_results: - results = mmlu_output - else: - results = mmlu_output["results"] - return results + return mmlu_output # This method converts general errors from simple_evaluate # into a more user-understandable error diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index 55426667..dade3021 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -90,7 +90,7 @@ def run(self, server_url: str | None = None) -> tuple: self.prepare_unitxt_files() logger.debug(locals()) os.environ["TOKENIZERS_PARALLELISM"] = "true" - results = self._run_mmlu(server_url=server_url, return_all_results=True) + results = self._run_mmlu(server_url=server_url) taskname = self.tasks[0] global_scores = results["results"][taskname] global_scores.pop("alias") From d9187320ea80730f260c9462183e2a7f70bfbf44 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Thu, 7 Nov 2024 14:12:05 +0200 Subject: [PATCH 13/14] make sure temp files are deleted w finally Signed-off-by: Roni Friedman-Melamed --- src/instructlab/eval/unitxt.py | 46 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/instructlab/eval/unitxt.py b/src/instructlab/eval/unitxt.py index dade3021..46618db5 100644 --- a/src/instructlab/eval/unitxt.py +++ b/src/instructlab/eval/unitxt.py @@ -51,7 +51,7 @@ def __init__( self.unitxt_recipe = unitxt_recipe def assign_tasks_dir(self, task_name): - return f"{TEMP_DIR_PREFIX}_{task_name}" + return os.path.join("eval_output", f"{TEMP_DIR_PREFIX}_{task_name}") def assign_task_name(self): return str(uuid4()) @@ -90,28 +90,30 @@ def run(self, server_url: str | None = None) -> tuple: self.prepare_unitxt_files() logger.debug(locals()) os.environ["TOKENIZERS_PARALLELISM"] = "true" - results = self._run_mmlu(server_url=server_url) - taskname = self.tasks[0] - global_scores = results["results"][taskname] - global_scores.pop("alias") try: - instances = results["samples"][taskname] - instance_scores = {} - metrics = [ - metric.replace("metrics.", "") - for metric in instances[0]["doc"]["metrics"] - ] - for i, instance in enumerate(instances): - scores = {} - for metric in metrics: - scores[metric] = instance[metric][0] - instance_scores[i] = scores - except KeyError as e: - logger.error("Error in extracting single instance scores") - logger.error(e) - logger.error(e.__traceback__) - instance_scores = None - self.remove_unitxt_files() + results = self._run_mmlu(server_url=server_url) + taskname = self.tasks[0] + global_scores = results["results"][taskname] + global_scores.pop("alias") + try: + instances = results["samples"][taskname] + instance_scores = {} + metrics = [ + metric.replace("metrics.", "") + for metric in instances[0]["doc"]["metrics"] + ] + for i, instance in enumerate(instances): + scores = {} + for metric in metrics: + scores[metric] = instance[metric][0] + instance_scores[i] = scores + except KeyError as e: + logger.error("Error in extracting single instance scores") + logger.error(e) + logger.error(e.__traceback__) + instance_scores = None + finally: + self.remove_unitxt_files() return global_scores, instance_scores From 7c9e44c6f1e48a4ae9174a006fd04b41e6a535f4 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Thu, 7 Nov 2024 14:18:02 +0200 Subject: [PATCH 14/14] test in scripts Signed-off-by: Roni Friedman-Melamed --- {tests => scripts}/test_unitxt.py | 0 src/instructlab/eval/mmlu.py | 6 ++---- 2 files changed, 2 insertions(+), 4 deletions(-) rename {tests => scripts}/test_unitxt.py (100%) diff --git a/tests/test_unitxt.py b/scripts/test_unitxt.py similarity index 100% rename from tests/test_unitxt.py rename to scripts/test_unitxt.py diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py index f290eb5b..16776d20 100644 --- a/src/instructlab/eval/mmlu.py +++ b/src/instructlab/eval/mmlu.py @@ -142,7 +142,7 @@ def run(self, server_url: str | None = None) -> tuple: agg_score: float = 0.0 results = self._run_mmlu(server_url) - for task, result in results['results'].items(): + for task, result in results["results"].items(): agg_score += float(result["acc,none"]) individual_scores[task] = { "score": float(result["acc,none"]), @@ -153,9 +153,7 @@ def run(self, server_url: str | None = None) -> tuple: return overall_score, individual_scores - def _run_mmlu( - self, server_url: str | None = None - ) -> dict: + def _run_mmlu(self, server_url: str | None = None) -> dict: if server_url is not None: # Requires lm_eval >= 0.4.4 model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface"