Skip to content

Commit 121a86f

Browse files
committed
fix saving and add test script
1 parent 3faeddb commit 121a86f

File tree

2 files changed

+30
-19
lines changed

2 files changed

+30
-19
lines changed

scripts/test_leaderboard.py

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
"""Smoke-test script: run LeaderboardV2Evaluator and print its results as JSON."""

# Standard
import json

# First Party
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

if __name__ == "__main__":
    evaluator = LeaderboardV2Evaluator(
        model_path="ibm-granite/granite-3.1-8b-instruct",
    )
    results = evaluator.run()
    print("got results from leaderboard v2")
    # `json` must be imported explicitly; without it this line raises
    # NameError only after the evaluation run has already completed.
    print(json.dumps(results, indent=2))

src/instructlab/eval/leaderboard.py

+20-19
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
169169
p.join()
170170

171171
# extract the result which is not None
172-
assert (
173-
len([res for res in results.values() if res is not None]) == 1
174-
), "we expect exactly 1 process to return a results dict properly"
172+
assert len([res for res in results.values() if res is not None]) == 1, (
173+
"we expect exactly 1 process to return a results dict properly"
174+
)
175175
results_dict = [res for res in results.values() if res is not None][0]
176176
return results_dict
177177

@@ -237,9 +237,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
237237
parsed_scores = parse_multitask_results(
238238
result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
239239
)
240-
assert (
241-
len(parsed_scores["subtasks"]) == 24
242-
), "there should be 24 subtasks of bbh run"
240+
assert len(parsed_scores["subtasks"]) == 24, (
241+
"there should be 24 subtasks of bbh run"
242+
)
243243
return parsed_scores
244244

245245

@@ -290,9 +290,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
290290
scores.append(value)
291291
target_metrics.remove(metric)
292292

293-
assert (
294-
len(scores) == 2
295-
), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
293+
assert len(scores) == 2, (
294+
f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
295+
)
296296
return {
297297
"score": sum(scores) / 2,
298298
}
@@ -316,9 +316,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
316316
parsed_scores = parse_multitask_results(
317317
result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
318318
)
319-
assert (
320-
len(parsed_scores["subtasks"]) == 3
321-
), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
319+
assert len(parsed_scores["subtasks"]) == 3, (
320+
f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
321+
)
322322
return parsed_scores
323323

324324

@@ -329,9 +329,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
329329
parsed_scores = parse_multitask_results(
330330
result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
331331
)
332-
assert (
333-
len(parsed_scores["subtasks"]) == 7
334-
), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
332+
assert len(parsed_scores["subtasks"]) == 7, (
333+
f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
334+
)
335335
return parsed_scores
336336

337337

@@ -366,9 +366,9 @@ def get_scores_from_result_dicts(
366366
# this is just a sanity check step
367367
benchmarks_already_covered = set(parsed_scores.keys())
368368
overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
369-
assert (
370-
len(benchmarks_already_covered & benchmarks_to_parse) == 0
371-
), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
369+
assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
370+
f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
371+
)
372372

373373
# now actually add them
374374
for benchmark in benchmarks_to_parse:
@@ -506,7 +506,8 @@ def save_to_file(self, output_file: str = None):
506506

507507
# create the directory if it doesn't exist
508508
output_dir = os.path.dirname(output_file)
509-
os.makedirs(output_dir, exist_ok=True)
509+
if output_dir:
510+
os.makedirs(output_dir, exist_ok=True)
510511
with open(output_file, "w") as f:
511512
json.dump(self._results, f, indent=2)
512513

0 commit comments

Comments
 (0)