@@ -169,9 +169,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
        p.join()

    # extract the result which is not None
-    assert (
-        len([res for res in results.values() if res is not None]) == 1
-    ), "we expect exactly 1 process to return a results dict properly"
+    assert len([res for res in results.values() if res is not None]) == 1, (
+        "we expect exactly 1 process to return a results dict properly"
+    )
    results_dict = [res for res in results.values() if res is not None][0]
    return results_dict

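This hunk, and the similar ones below, restyle long assert statements: the condition stays on the assert line and only the message is wrapped in parentheses, instead of parenthesizing the condition and leaving the message outside. Both forms behave identically at runtime; the form to avoid is wrapping the whole `condition, message` pair in parentheses, which builds a non-empty tuple that is always truthy. A minimal standalone sketch (not code from this file):

results = {0: None, 1: {"score": 1.0}}

# old wrapping: parenthesized condition, message after the comma
assert (
    len([res for res in results.values() if res is not None]) == 1
), "we expect exactly 1 process to return a results dict properly"

# new wrapping: condition inline, parenthesized message
assert len([res for res in results.values() if res is not None]) == 1, (
    "we expect exactly 1 process to return a results dict properly"
)

# pitfall (what neither version does): parenthesizing condition and message
# together makes a tuple, so the assert can never fail
# assert (len(results) == 99, "this message is silently swallowed")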
@@ -237,9 +237,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
    )
-    assert (
-        len(parsed_scores["subtasks"]) == 24
-    ), "there should be 24 subtasks of bbh run"
+    assert len(parsed_scores["subtasks"]) == 24, (
+        "there should be 24 subtasks of bbh run"
+    )
    return parsed_scores

@@ -290,9 +290,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
            scores.append(value)
            target_metrics.remove(metric)

-    assert (
-        len(scores) == 2
-    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
+    assert len(scores) == 2, (
+        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
+    )
    return {
        "score": sum(scores) / 2,
    }
@@ -316,9 +316,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
    )
-    assert (
-        len(parsed_scores["subtasks"]) == 3
-    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
+    assert len(parsed_scores["subtasks"]) == 3, (
+        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
+    )
    return parsed_scores

@@ -329,9 +329,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
    )
-    assert (
-        len(parsed_scores["subtasks"]) == 7
-    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
+    assert len(parsed_scores["subtasks"]) == 7, (
+        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
+    )
    return parsed_scores

@@ -366,9 +366,9 @@ def get_scores_from_result_dicts(
        # this is just a sanity check step
        benchmarks_already_covered = set(parsed_scores.keys())
        overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-        assert (
-            len(benchmarks_already_covered & benchmarks_to_parse) == 0
-        ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
+        assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
+            f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
+        )

        # now actually add them
        for benchmark in benchmarks_to_parse:
@@ -506,7 +506,8 @@ def save_to_file(self, output_file: str = None):

        # create the directory if it doesn't exist
        output_dir = os.path.dirname(output_file)
-        os.makedirs(output_dir, exist_ok=True)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
        with open(output_file, "w") as f:
            json.dump(self._results, f, indent=2)
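The last hunk guards against output paths with no directory component: os.path.dirname("results.json") returns "", and os.makedirs("", exist_ok=True) raises FileNotFoundError. A minimal sketch of the guarded behaviour, using a hypothetical standalone function and file names rather than the class above:

import json
import os

def save_results(results: dict, output_file: str) -> None:
    # only create a directory when the path actually contains one
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)

save_results({"score": 1.0}, "results.json")           # bare filename: makedirs is skipped
save_results({"score": 1.0}, "out/run1/results.json")  # nested path: directories are created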