Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

update lm-eval to 0.4.3 #1658

Merged
merged 6 commits on Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/script/formatScan/pylint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ else
echo "Not found requirements.txt file."
fi
# install packages
pip install lm-eval==0.4.2
pip install lm-eval==0.4.3
pip install accelerate nlpaug nltk schema optimum-intel optimum peft
pip install --upgrade --force-reinstall transformers==4.36.2
pip install optimum-habana
Expand Down
2 changes: 1 addition & 1 deletion examples/huggingface/neural_speed/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
intel_extension_for_transformers
neural-speed
lm-eval==0.4.2
lm-eval==0.4.3
sentencepiece
gguf
--extra-index-url https://download.pytorch.org/whl/cpu
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
transformers
accelerate
sentencepiece != 0.1.92
lm-eval==0.4.2
lm-eval==0.4.3
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ transformers
torch==2.0.1
tqdm
neural_compressor
lm-eval==0.4.2
lm-eval==0.4.3

Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ wandb
einops
neural-compressor
pytest==8.0.0
lm-eval==0.4.2
lm-eval==0.4.3
git+https://github.com/huggingface/peft.git@6c44096c7b8d55a2ecf24be9bc68393467e1584a
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ tiktoken #qwen
einops #qwen
auto-round
git+https://github.com/intel/neural-compressor.git
lm-eval==0.4.2
lm-eval==0.4.3
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ einops #qwen
git+https://github.com/intel/neural-speed.git
auto-round==0.2
git+https://github.com/intel/neural-compressor.git
lm-eval==0.4.2
lm-eval==0.4.3
huggingface_hub
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ transformers_stream_generator
tiktoken #qwen
einops #qwen
git+https://github.com/intel/neural-compressor.git
lm-eval==0.4.2
lm-eval==0.4.3
huggingface_hub
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ neural-compressor
optimum-intel > 1.12.0
onnxruntime
intel-extension-for-pytorch
lm-eval==0.4.2
lm-eval==0.4.3
2 changes: 1 addition & 1 deletion examples/modelscope/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
intel_extension_for_transformers
neural-speed
lm-eval==0.4.2
lm-eval==0.4.3
sentencepiece
gguf
--extra-index-url https://download.pytorch.org/whl/cpu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ fastapi
fschat==0.2.32
huggingface_hub
intel_extension_for_pytorch==2.3.0
lm-eval==0.4.2
lm-eval==0.4.3
neural-compressor
neural_speed==1.0a0
numpy==1.23.5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ evaluate
fastapi
fschat==0.2.35
huggingface_hub
lm-eval==0.4.2
lm-eval==0.4.3
neural-compressor
numpy==1.23.5
optimum
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ fastapi
fschat==0.2.35
huggingface_hub
intel-extension-for-transformers
lm-eval==0.4.2
lm-eval==0.4.3
neural-compressor
numpy==1.23.5
optimum
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ langchain-community==0.0.27
langchain_core==0.1.35
langid
librosa
lm-eval==0.4.2
lm-eval==0.4.3
markdown
neural-compressor
neural_speed==1.0a0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval.evaluator import(
request_caching_arg_to_dict
)
from lm_eval.logging_utils import WandbLogger
from lm_eval.loggers import WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import make_table, simple_parse_args_string

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
print_writeout,
run_task_tests,
)
from lm_eval.logging_utils import add_env_info, get_git_commit_hash
from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
from lm_eval import utils
Expand Down Expand Up @@ -509,9 +509,14 @@ def evaluate(
# aggregate results ; run bootstrap CIs
for task_output in eval_tasks:
task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
results, samples, configs, versions, num_fewshot = consolidate_results(
eval_tasks
)
(
results,
samples,
configs,
versions,
num_fewshot,
higher_is_better,
) = consolidate_results(eval_tasks)

### Calculate group metrics ###
if bool(results):
Expand All @@ -522,6 +527,24 @@ def evaluate(
# or `task_name: []`.
# we only want to operate on groups here.
continue

# collect all higher_is_better values for metrics
# in the group's subtasks.
# TODO: clean this up ; unify with the below metric_list loop?
_higher_is_better = {}
for task in task_list:
for m, h in higher_is_better[task].items():
if m not in _higher_is_better.keys():
_higher_is_better[m] = h
if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h:
eval_logger.warning(
f"Higher_is_better values for metric {m} in group {group} are not consistent." +
f"Defaulting to None."
)
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better

# collect all metric keys used by a subtask in the group.
metric_list = list(
{
key
Expand All @@ -534,38 +557,22 @@ def evaluate(
stderr = "_stderr,".join(metric.split(","))

# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
metrics = [results[task][metric] for task in task_list if metric in results[task]] # TODO: copy?
stderrs = [results[task][stderr] for task in task_list if stderr in results[task]]
sizes = [results[task]["samples"] for task in task_list if metric in results[task]]

# compute group's pooled metric and stderr
results[group][metric] = (
lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
)
results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
)
results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula,
# comment out the above and uncomment this line:
# results[group][stderr] = \
# lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs,
# sizes, metrics=metrics)

results[group]["samples"] = sum(sizes)

Expand All @@ -578,19 +585,15 @@ def evaluate(
if len(left_tasks_list) == 0:
break

_task_hierarchy = {
k: v for k, v in task_hierarchy.items() if k in left_tasks_list
}
_task_hierarchy = {k: v for k, v in task_hierarchy.items() if k in left_tasks_list}
_results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)

results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}

for group_name, task_list in task_hierarchy.items():
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
num_fewshot[group_name] = num_fewshot[task_list[0]] # TODO: validate this

results_dict = {
"results": dict(results_agg.items()),
Expand All @@ -599,6 +602,17 @@ def evaluate(
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())),
"higher_is_better": dict(sorted(higher_is_better.items())),
"n-samples": {
task_output.task_name: {
"original": len(task_output.task.eval_docs),
"effective": min(
limit if limit else len(task_output.task.eval_docs),
len(task_output.task.eval_docs),
),
}
for task_output in eval_tasks
},
}
if log_samples:
results_dict["samples"] = dict(samples)
Expand All @@ -608,7 +622,6 @@ def evaluate(
else:
return None


def request_caching_arg_to_dict(cache_requests: str) -> dict:
request_caching_args = {
"cache_requests": cache_requests in {"true", "refresh"},
Expand Down
2 changes: 1 addition & 1 deletion tests/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ git+https://github.com/intel/neural-compressor.git
git+https://github.com/intel/neural-speed.git
intel-extension-for-pytorch==2.3.0
intel-tensorflow==2.14.0
lm-eval==0.4.2
lm-eval==0.4.3
mlflow
nlpaug==1.1.9
onnx
Expand Down
Loading