From b0344a1abc6351aa31faae09b3021f7f09b5cbaf Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 8 Jul 2024 20:05:02 -0700 Subject: [PATCH] update lm-eval to 0.4.3 Signed-off-by: changwangss --- .github/workflows/script/formatScan/pylint.sh | 2 +- .../huggingface/neural_speed/requirements.txt | 2 +- .../inference/requirements.txt | 2 +- .../pruning/requirements.txt | 2 +- .../quantization/requirements.txt | 2 +- .../quantization/requirements_GPU.txt | 2 +- .../quantization/requirements_cpu_woq.txt | 2 +- .../quantization/requirements_sq.txt | 2 +- .../text2text-generation/requirements.txt | 2 +- examples/modelscope/requirements.txt | 2 +- .../neural_chat/requirements_cpu.txt | 2 +- .../neural_chat/requirements_hpu.txt | 2 +- .../neural_chat/requirements_win.txt | 2 +- .../neural_chat/tests/requirements.txt | 2 +- .../llm/evaluation/lm_eval/accuracy.py | 2 +- .../llm/evaluation/lm_eval/evaluator.py | 82 +++++++++++-------- tests/requirements.txt | 2 +- 17 files changed, 62 insertions(+), 52 deletions(-) diff --git a/.github/workflows/script/formatScan/pylint.sh b/.github/workflows/script/formatScan/pylint.sh index 41e22c470b9..8a6eabf9aeb 100644 --- a/.github/workflows/script/formatScan/pylint.sh +++ b/.github/workflows/script/formatScan/pylint.sh @@ -28,7 +28,7 @@ else echo "Not found requirements.txt file." fi # install packages -pip install lm-eval==0.4.2 +pip install lm-eval==0.4.3 pip install accelerate nlpaug nltk schema optimum-intel optimum peft pip install --upgrade --force-reinstall transformers==4.36.2 pip install optimum-habana diff --git a/examples/huggingface/neural_speed/requirements.txt b/examples/huggingface/neural_speed/requirements.txt index 3f7fca6d65d..9dc6fb86dca 100644 --- a/examples/huggingface/neural_speed/requirements.txt +++ b/examples/huggingface/neural_speed/requirements.txt @@ -1,6 +1,6 @@ intel_extension_for_transformers neural-speed -lm-eval==0.4.2 +lm-eval==0.4.3 sentencepiece gguf --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/examples/huggingface/pytorch/language-modeling/inference/requirements.txt b/examples/huggingface/pytorch/language-modeling/inference/requirements.txt index cd6cd604899..168d38111ea 100644 --- a/examples/huggingface/pytorch/language-modeling/inference/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/inference/requirements.txt @@ -1,4 +1,4 @@ transformers accelerate sentencepiece != 0.1.92 -lm-eval==0.4.2 +lm-eval==0.4.3 diff --git a/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt b/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt index a1ea63132a8..6f8cc176a85 100644 --- a/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt @@ -7,5 +7,5 @@ transformers torch==2.0.1 tqdm neural_compressor -lm-eval==0.4.2 +lm-eval==0.4.3 diff --git a/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt b/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt index 36ee5a1b55a..dc4594f070f 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt @@ -9,5 +9,5 @@ wandb einops neural-compressor pytest==8.0.0 -lm-eval==0.4.2 +lm-eval==0.4.3 git+https://github.com/huggingface/peft.git@6c44096c7b8d55a2ecf24be9bc68393467e1584a diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt index 1b000e0c61b..15cb3a94d8a 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt +++ b/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt @@ -14,4 +14,4 @@ tiktoken #qwen einops #qwen auto-round git+https://github.com/intel/neural-compressor.git -lm-eval==0.4.2 +lm-eval==0.4.3 diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements_cpu_woq.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements_cpu_woq.txt index 6b26c54da69..6a4e11321aa 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/requirements_cpu_woq.txt +++ b/examples/huggingface/pytorch/text-generation/quantization/requirements_cpu_woq.txt @@ -13,5 +13,5 @@ einops #qwen git+https://github.com/intel/neural-speed.git auto-round==0.2 git+https://github.com/intel/neural-compressor.git -lm-eval==0.4.2 +lm-eval==0.4.3 huggingface_hub diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt index 02655339b5d..b9d5c75461e 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt +++ b/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt @@ -13,5 +13,5 @@ transformers_stream_generator tiktoken #qwen einops #qwen git+https://github.com/intel/neural-compressor.git -lm-eval==0.4.2 +lm-eval==0.4.3 huggingface_hub diff --git a/examples/huggingface/pytorch/text2text-generation/requirements.txt b/examples/huggingface/pytorch/text2text-generation/requirements.txt index 73e4ae2e655..ada664326c7 100644 --- a/examples/huggingface/pytorch/text2text-generation/requirements.txt +++ b/examples/huggingface/pytorch/text2text-generation/requirements.txt @@ -11,4 +11,4 @@ neural-compressor optimum-intel > 1.12.0 onnxruntime intel-extension-for-pytorch -lm-eval==0.4.2 +lm-eval==0.4.3 diff --git a/examples/modelscope/requirements.txt b/examples/modelscope/requirements.txt index b04bd189db0..885a22f6c4f 100644 --- a/examples/modelscope/requirements.txt +++ b/examples/modelscope/requirements.txt @@ -1,6 +1,6 @@ intel_extension_for_transformers neural-speed -lm-eval==0.4.2 +lm-eval==0.4.3 sentencepiece gguf --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt index 7b38113697b..57931710829 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt @@ -7,7 +7,7 @@ fastapi fschat==0.2.32 huggingface_hub intel_extension_for_pytorch==2.3.0 -lm-eval==0.4.2 +lm-eval==0.4.3 neural-compressor neural_speed==1.0a0 numpy==1.23.5 diff --git a/intel_extension_for_transformers/neural_chat/requirements_hpu.txt b/intel_extension_for_transformers/neural_chat/requirements_hpu.txt index f3983b6d3c5..30a53b2709a 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_hpu.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_hpu.txt @@ -4,7 +4,7 @@ evaluate fastapi fschat==0.2.35 huggingface_hub -lm-eval==0.4.2 +lm-eval==0.4.3 neural-compressor numpy==1.23.5 optimum diff --git a/intel_extension_for_transformers/neural_chat/requirements_win.txt b/intel_extension_for_transformers/neural_chat/requirements_win.txt index 56ac6027ab4..74e5097505e 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_win.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_win.txt @@ -6,7 +6,7 @@ fastapi fschat==0.2.35 huggingface_hub intel-extension-for-transformers -lm-eval==0.4.2 +lm-eval==0.4.3 neural-compressor numpy==1.23.5 optimum diff --git a/intel_extension_for_transformers/neural_chat/tests/requirements.txt b/intel_extension_for_transformers/neural_chat/tests/requirements.txt index 97a46d2e502..d2ae8edc6e4 100644 --- a/intel_extension_for_transformers/neural_chat/tests/requirements.txt +++ b/intel_extension_for_transformers/neural_chat/tests/requirements.txt @@ -38,7 +38,7 @@ langchain-community==0.0.27 langchain_core==0.1.35 langid librosa -lm-eval==0.4.2 +lm-eval==0.4.3 markdown neural-compressor neural_speed==1.0a0 diff --git a/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/accuracy.py b/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/accuracy.py index 7cc8ac75012..61bc08c585e 100644 --- a/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/accuracy.py +++ b/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/accuracy.py @@ -43,7 +43,7 @@ from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval.evaluator import( request_caching_arg_to_dict ) -from lm_eval.logging_utils import WandbLogger +from lm_eval.loggers import WandbLogger from lm_eval.tasks import TaskManager from lm_eval.utils import make_table, simple_parse_args_string diff --git a/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py b/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py index 3dc1c2872a3..d00578ef631 100644 --- a/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py +++ b/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py @@ -38,7 +38,7 @@ print_writeout, run_task_tests, ) -from lm_eval.logging_utils import add_env_info, get_git_commit_hash +from lm_eval.loggers import add_env_info, get_git_commit_hash from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string from lm_eval import utils @@ -509,9 +509,14 @@ def evaluate( # aggregate results ; run bootstrap CIs for task_output in eval_tasks: task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) - results, samples, configs, versions, num_fewshot = consolidate_results( - eval_tasks - ) + ( + results, + samples, + configs, + versions, + num_fewshot, + higher_is_better, + ) = consolidate_results(eval_tasks) ### Calculate group metrics ### if bool(results): @@ -522,6 +527,23 @@ def evaluate( # or `task_name: []`. # we only want to operate on groups here. continue + + # collect all higher_is_better values for metrics + # in the group's subtasks. + # TODO: clean this up ; unify with the below metric_list loop? + _higher_is_better = {} + for task in task_list: + for m, h in higher_is_better[task].items(): + if m not in _higher_is_better.keys(): + _higher_is_better[m] = h + if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h: + eval_logger.warning( + f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None." + ) + _higher_is_better[m] = None + higher_is_better[group] = _higher_is_better + + # collect all metric keys used by a subtask in the group. metric_list = list( { key @@ -534,38 +556,20 @@ def evaluate( stderr = "_stderr,".join(metric.split(",")) # gather metrics, sizes, and stderrs from subtasks - metrics = [ - results[task][metric] - for task in task_list - if metric in results[task] - ] # TODO: copy? - stderrs = [ - results[task][stderr] - for task in task_list - if stderr in results[task] - ] - sizes = [ - results[task]["samples"] - for task in task_list - if metric in results[task] - ] + metrics = [results[task][metric] for task in task_list if metric in results[task]] # TODO: copy? + stderrs = [results[task][stderr] for task in task_list if stderr in results[task]] + sizes = [results[task]["samples"] for task in task_list if metric in results[task]] # compute group's pooled metric and stderr - results[group][metric] = ( - lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) - ) + results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) # TODO: calculate grouped metric using aggregation fn if "N/A" in stderrs: results[group][stderr] = "N/A" else: - results[group][stderr] = ( - lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) - ) + results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility - # To use the old (likely incorrect) variance formula, - # comment out the above and uncomment this line: - # results[group][stderr] = \ - # lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics) + # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line: + # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics) results[group]["samples"] = sum(sizes) @@ -578,9 +582,7 @@ def evaluate( if len(left_tasks_list) == 0: break - _task_hierarchy = { - k: v for k, v in task_hierarchy.items() if k in left_tasks_list - } + _task_hierarchy = {k: v for k, v in task_hierarchy.items() if k in left_tasks_list} _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results) results_agg = {**results_agg, **_results_agg} @@ -588,9 +590,7 @@ def evaluate( for group_name, task_list in task_hierarchy.items(): if task_list: - num_fewshot[group_name] = num_fewshot[ - task_list[0] - ] # TODO: validate this + num_fewshot[group_name] = num_fewshot[task_list[0]] # TODO: validate this results_dict = { "results": dict(results_agg.items()), @@ -599,6 +599,17 @@ def evaluate( "configs": dict(sorted(configs.items())), "versions": dict(sorted(versions.items())), "n-shot": dict(sorted(num_fewshot.items())), + "higher_is_better": dict(sorted(higher_is_better.items())), + "n-samples": { + task_output.task_name: { + "original": len(task_output.task.eval_docs), + "effective": min( + limit if limit else len(task_output.task.eval_docs), + len(task_output.task.eval_docs), + ), + } + for task_output in eval_tasks + }, } if log_samples: results_dict["samples"] = dict(samples) @@ -608,7 +619,6 @@ def evaluate( else: return None - def request_caching_arg_to_dict(cache_requests: str) -> dict: request_caching_args = { "cache_requests": cache_requests in {"true", "refresh"}, diff --git a/tests/requirements.txt b/tests/requirements.txt index d2c2dca3f74..1b28b53ca25 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -12,7 +12,7 @@ git+https://github.com/intel/neural-compressor.git git+https://github.com/intel/neural-speed.git intel-extension-for-pytorch==2.3.0 intel-tensorflow==2.14.0 -lm-eval==0.4.2 +lm-eval==0.4.3 mlflow nlpaug==1.1.9 onnx