diff --git a/lmms_eval/tasks/wild_vision_bench/_default_template_yaml b/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
index 33d18d39..9e639958 100644
--- a/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
+++ b/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
@@ -16,9 +16,33 @@ generation_kwargs:
 process_results: !function utils.wild_vision_process_results
 # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
 metric_list:
-  - metric: gpt_eval_score
-    aggregation: !function utils.wild_vision_aggregation
+  - metric: raw_scores
+    aggregation: !function utils.wild_vision_aggregation_raw_scores
     higher_is_better: true
+  - metric: elo_scores
+    aggregation: !function utils.wild_vision_aggregation_elo_scores
+    higher_is_better: true
+  - metric: win_rates
+    aggregation: !function utils.wild_vision_aggregation_win_rates
+    higher_is_better: true
+  - metric: judgements_better
+    aggregation: !function utils.wild_vision_aggregation_judgements_better
+    higher_is_better: true
+  - metric: judgements_better_plus
+    aggregation: !function utils.wild_vision_aggregation_judgements_better_plus
+    higher_is_better: true
+  - metric: judgements_worse
+    aggregation: !function utils.wild_vision_aggregation_judgements_worse
+    higher_is_better: false
+  - metric: judgements_worse_plus
+    aggregation: !function utils.wild_vision_aggregation_judgements_worse_plus
+    higher_is_better: false
+  - metric: judgements_tie
+    aggregation: !function utils.wild_vision_aggregation_judgements_tie
+    higher_is_better: false
+  - metric: judgements_unclear
+    aggregation: !function utils.wild_vision_aggregation_judgements_unclear
+    higher_is_better: false
 metadata:
   judge_model: gpt-4o
   baseline_model: claude-3-sonnet-20240229
diff --git a/lmms_eval/tasks/wild_vision_bench/utils.py b/lmms_eval/tasks/wild_vision_bench/utils.py
index f254157f..b3bcbc68 100644
--- a/lmms_eval/tasks/wild_vision_bench/utils.py
+++ b/lmms_eval/tasks/wild_vision_bench/utils.py
@@ -158,25 +158,34 @@ def wild_vision_process_results(doc, results):
     score = resps
 
     if "A>B" in score:
+        raw_score = -1
         winner = "model_a"
         judgement = "Worse"  # Baseline better
     elif "A>>B" in score:
+        raw_score = -2
         winner = "model_a"
         judgement = "Worse++"
     elif "A=B" in score:
+        raw_score = 0
         winner = "tie"
         judgement = "Tie"
     elif "B>A" in score:
+        raw_score = 1
         winner = "model_b"
         judgement = "Better"
     elif "B>>A" in score:
+        raw_score = 2
         winner = "model_b"
         judgement = "Better++"
     else:
+        raw_score = 0
         winner = "tie"
         judgement = "Unclear"
 
     return {
+        "raw_scores": {
+            "final_score": raw_score,
+        },
         "elo_scores": {
             "question": doc["instruction"],
             "model_a": BASELINE_MODEL_NAME,
@@ -292,6 +301,13 @@ def get_win_rate_column(df, column, baseline):
     return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
 
 
+def wild_vision_aggregation_raw_scores(results):
+    total_score = 0
+    for result in results:
+        total_score += result["final_score"]
+    return total_score
+
+
 def wild_vision_aggregation_elo_scores(results):
     battles = prepare_elo_data(results)
     elo_ratings = compute_mle_elo(battles, BASELINE_MODEL_NAME)
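For context, here is a minimal sketch (not part of the patch) of how the new `raw_scores` metric flows end to end: `wild_vision_process_results` maps the judge's verdict to a `final_score` in {-2, -1, 0, 1, 2}, and `wild_vision_aggregation_raw_scores` sums those values across all documents. The `score_for` helper and the example verdict strings below are illustrative assumptions, not code from the repository.

```python
# Illustrative sketch only -- mirrors the mapping introduced in
# wild_vision_process_results and the summation performed by
# wild_vision_aggregation_raw_scores in this patch.

def score_for(verdict: str) -> int:
    # Same substring checks and scores as the patched branch logic
    if "A>B" in verdict:
        return -1   # baseline (model A) better
    elif "A>>B" in verdict:
        return -2   # baseline much better
    elif "A=B" in verdict:
        return 0    # tie
    elif "B>A" in verdict:
        return 1    # evaluated model better
    elif "B>>A" in verdict:
        return 2    # evaluated model much better
    return 0        # unclear verdict

# Per-document dicts as returned under the "raw_scores" key
results = [{"final_score": score_for(v)} for v in ("B>>A", "A=B", "A>B")]

# Mirrors wild_vision_aggregation_raw_scores: a plain sum over final_score
total = sum(r["final_score"] for r in results)
print(total)  # 2 + 0 + (-1) == 1
```

Note that the aggregation reports the summed score rather than a per-document average, so its magnitude grows with the number of evaluated examples.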