From ceb1371ab799e3efa1fae194db4c7121f3bc75c7 Mon Sep 17 00:00:00 2001
From: abzb1
Date: Fri, 2 Aug 2024 12:20:40 +0000
Subject: [PATCH] Add SEEDBench 2 Plus

---
 docs/current_tasks.md                        |  1 +
 lmms_eval/tasks/seedbench_2/seedbench_2.yaml |  2 +-
 .../seedbench_2_plus/seedbench_2_plus.yaml   | 43 +++++++++++++++
 lmms_eval/tasks/seedbench_2_plus/utils.py    | 52 +++++++++++++++++++
 4 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100755 lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
 create mode 100755 lmms_eval/tasks/seedbench_2_plus/utils.py

diff --git a/docs/current_tasks.md b/docs/current_tasks.md
index 6e0e5238..a9949cb6 100644
--- a/docs/current_tasks.md
+++ b/docs/current_tasks.md
@@ -105,6 +105,7 @@
 - ScreenSpot REG / Instruction Generation (screenspot_reg)
 - SeedBench (seedbench)
 - SeedBench 2 (seedbench_2)
+- SeedBench 2 Plus (seedbench_2_plus)
 - ST-VQA (stvqa)
 - TextCaps (textcaps)
 - TextCaps Validation (textcaps_val)
diff --git a/lmms_eval/tasks/seedbench_2/seedbench_2.yaml b/lmms_eval/tasks/seedbench_2/seedbench_2.yaml
index 5fb8cf78..596e2cc0 100755
--- a/lmms_eval/tasks/seedbench_2/seedbench_2.yaml
+++ b/lmms_eval/tasks/seedbench_2/seedbench_2.yaml
@@ -1,7 +1,7 @@
 dataset_path: lmms-lab/SEED-Bench-2
 dataset_kwargs:
   token: True
-task: "seedbench-2"
+task: "seedbench_2"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function utils.seed_doc_to_visual
diff --git a/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml b/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
new file mode 100755
index 00000000..c47a3dba
--- /dev/null
+++ b/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
@@ -0,0 +1,43 @@
+dataset_path: doolayer/SEED-Bench-2-Plus
+dataset_kwargs:
+  token: True
+task: "seedbench_2_plus"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.seed_doc_to_visual
+doc_to_text: !function utils.seed_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 16
+  image_aspect_ratio: original
+# The return value of process_results will be used by metrics
+process_results: !function utils.seed_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: seedbench_2_plus_Chart
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seedbench_2_plus_Map
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seedbench_2_plus_Web
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seedbench_2_plus_all
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+metadata:
+  - version: 0.0
+
+model_specific_prompt_kwargs:
+  llava :
+    img_token : <image>
+    post_prompt : "Answer with the option's letter from the given choices directly."
+  gpt4V :
+    img_token : <img>
+    post_prompt : "Answer with the option's letter from the given choices directly."
+  default :
+    img_token : <img>
+    post_prompt : "Answer with the option's letter from the given choices directly."
\ No newline at end of file
diff --git a/lmms_eval/tasks/seedbench_2_plus/utils.py b/lmms_eval/tasks/seedbench_2_plus/utils.py
new file mode 100755
index 00000000..3182b4bc
--- /dev/null
+++ b/lmms_eval/tasks/seedbench_2_plus/utils.py
@@ -0,0 +1,52 @@
+import json
+
+def seed_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+def parse_choice_img(choice: str, img_token: str):
+    if "jpg" in choice or "png" in choice:
+        return img_token
+    return choice
+
+
+def seed_doc_to_text(doc, model_specific_kwargs=None):
+    question = doc["question"]
+    question = question.replace("<img>", model_specific_kwargs["img_token"])
+    question += "\n" + f"A. {parse_choice_img(doc['choice_A'], model_specific_kwargs['img_token'])}\n"
+    question += f"B. {parse_choice_img(doc['choice_B'], model_specific_kwargs['img_token'])}\n"
+    question += f"C. {parse_choice_img(doc['choice_C'], model_specific_kwargs['img_token'])}\n"
+    question += f"D. {parse_choice_img(doc['choice_D'], model_specific_kwargs['img_token'])}"
+
+    return f"{question}\n{model_specific_kwargs['post_prompt']}"
+
+
+def seed_process_result(doc, result):
+    pred = result[0].strip()
+    if len(pred) > 1:
+        pred = pred[0]
+    answer = doc["answer"]
+    data_type = doc["question_image_type"].capitalize()
+
+    return {f"seedbench_2_plus_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, "seedbench_2_plus_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}}
+
+
+def seed_aggregation_result(results):
+    total_count = 0
+    total_correct = 0
+    for result in results:
+        if result["pred"].lower().strip() == result["answer"].lower().strip():
+            total_correct += 1
+        total_count += 1
+    return total_correct / total_count if total_count != 0 else 0
+
+
+def seed_aggregation_result_all(results):
+    score = seed_aggregation_result(results)
+    stored_results = []
+    for result in results:
+        stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]})
+    with open("./seed_submission.json", "w") as f:
+        json.dump(stored_results, f, indent=4)
+    print("Storing files for seed_submission ...")
+
+    return score
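
As a quick sanity check of the new hooks, the sketch below walks one hand-written record through doc_to_text, process_results, and the aggregation function. It is a minimal sketch and not part of the patch: the fake_doc dict and all of its field values are invented for illustration, and the file-path import assumes the script is run from the repository root after the patch is applied.

# Minimal sketch (assumptions: run from the repo root with the patch applied;
# fake_doc and its values are made up for illustration only).
import importlib.util

spec = importlib.util.spec_from_file_location(
    "seedbench_2_plus_utils", "lmms_eval/tasks/seedbench_2_plus/utils.py"
)
utils = importlib.util.module_from_spec(spec)
spec.loader.exec_module(utils)

# A hand-written document mimicking one SEED-Bench-2-Plus record (not real data).
fake_doc = {
    "question": "Which label is shown in the highlighted region? <img>",
    "choice_A": "North",
    "choice_B": "South",
    "choice_C": "chart.png",  # image-path choices get replaced by the img_token
    "choice_D": "East",
    "answer": "A",
    "question_id": "demo-0",
    "question_image_type": "chart",
}
kwargs = {
    "img_token": "<image>",
    "post_prompt": "Answer with the option's letter from the given choices directly.",
}

prompt = utils.seed_doc_to_text(fake_doc, kwargs)           # text sent to the model
scores = utils.seed_process_result(fake_doc, ["A. North"])  # raw output -> per-metric entries
acc = utils.seed_aggregation_result([scores["seedbench_2_plus_all"]])
print(prompt)
print(acc)  # 1.0, since the first letter of the prediction matches the answer

This is roughly the flow lmms-eval drives internally once the task is selected, with doc_to_visual supplying the RGB image alongside the prompt.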