From ceb1371ab799e3efa1fae194db4c7121f3bc75c7 Mon Sep 17 00:00:00 2001
From: abzb1
Date: Fri, 2 Aug 2024 12:20:40 +0000
Subject: [PATCH] Add SEEDBench 2 Plus

---
 docs/current_tasks.md                        |  1 +
 lmms_eval/tasks/seedbench_2/seedbench_2.yaml |  2 +-
 .../seedbench_2_plus/seedbench_2_plus.yaml   | 43 +++++++++++++++
 lmms_eval/tasks/seedbench_2_plus/utils.py    | 52 +++++++++++++++++++
 4 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100755 lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
 create mode 100755 lmms_eval/tasks/seedbench_2_plus/utils.py

diff --git a/docs/current_tasks.md b/docs/current_tasks.md
index 6e0e5238..a9949cb6 100644
--- a/docs/current_tasks.md
+++ b/docs/current_tasks.md
@@ -105,6 +105,7 @@
 - ScreenSpot REG / Instruction Generation (screenspot_reg)
 - SeedBench (seedbench)
 - SeedBench 2 (seedbench_2)
+- SeedBench 2 Plus (seedbench_2_plus)
 - ST-VQA (stvqa)
 - TextCaps (textcaps)
 - TextCaps Validation (textcaps_val)
diff --git a/lmms_eval/tasks/seedbench_2/seedbench_2.yaml b/lmms_eval/tasks/seedbench_2/seedbench_2.yaml
index 5fb8cf78..596e2cc0 100755
--- a/lmms_eval/tasks/seedbench_2/seedbench_2.yaml
+++ b/lmms_eval/tasks/seedbench_2/seedbench_2.yaml
@@ -1,7 +1,7 @@
 dataset_path: lmms-lab/SEED-Bench-2
 dataset_kwargs:
   token: True
-task: "seedbench-2"
+task: "seedbench_2"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function utils.seed_doc_to_visual
diff --git a/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml b/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
new file mode 100755
index 00000000..c47a3dba
--- /dev/null
+++ b/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
@@ -0,0 +1,43 @@
+dataset_path: doolayer/SEED-Bench-2-Plus
+dataset_kwargs:
+  token: True
+task: "seedbench_2_plus"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.seed_doc_to_visual
+doc_to_text: !function utils.seed_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 16
+  image_aspect_ratio: original
+# The return value of process_results will be used by metrics
+process_results: !function utils.seed_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: seedbench_2_plus_Chart
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seedbench_2_plus_Map
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seedbench_2_plus_Web
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seedbench_2_plus_all
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+metadata:
+  - version: 0.0
+
+model_specific_prompt_kwargs:
+  llava :
+    img_token : <image>
+    post_prompt : "Answer with the option's letter from the given choices directly."
+  gpt4V :
+    img_token : <img>
+    post_prompt : "Answer with the option's letter from the given choices directly."
+  default :
+    img_token : <img>
+    post_prompt : "Answer with the option's letter from the given choices directly."
\ No newline at end of file
diff --git a/lmms_eval/tasks/seedbench_2_plus/utils.py b/lmms_eval/tasks/seedbench_2_plus/utils.py
new file mode 100755
index 00000000..3182b4bc
--- /dev/null
+++ b/lmms_eval/tasks/seedbench_2_plus/utils.py
@@ -0,0 +1,52 @@
+import json
+
+def seed_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+def parse_choice_img(choice: str, img_token: str):
+    if "jpg" in choice or "png" in choice:
+        return img_token
+    return choice
+
+
+def seed_doc_to_text(doc, model_specific_kwargs=None):
+    question = doc["question"]
+    question = question.replace("<img>", model_specific_kwargs["img_token"])
+    question += "\n" + f"A. {parse_choice_img(doc['choice_A'], model_specific_kwargs['img_token'])}\n"
+    question += f"B. {parse_choice_img(doc['choice_B'], model_specific_kwargs['img_token'])}\n"
+    question += f"C. {parse_choice_img(doc['choice_C'], model_specific_kwargs['img_token'])}\n"
+    question += f"D. {parse_choice_img(doc['choice_D'], model_specific_kwargs['img_token'])}"
+
+    return f"{question}\n{model_specific_kwargs['post_prompt']}"
+
+
+def seed_process_result(doc, result):
+    pred = result[0].strip()
+    if len(pred) > 1:
+        pred = pred[0]
+    answer = doc["answer"]
+    data_type = doc["question_image_type"].capitalize()
+
+    return {f"seedbench_2_plus_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, "seedbench_2_plus_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}}
+
+
+def seed_aggregation_result(results):
+    total_count = 0
+    total_correct = 0
+    for result in results:
+        if result["pred"].lower().strip() == result["answer"].lower().strip():
+            total_correct += 1
+        total_count += 1
+    return total_correct / total_count if total_count != 0 else 0
+
+
+def seed_aggregation_result_all(results):
+    score = seed_aggregation_result(results)
+    stored_results = []
+    for result in results:
+        stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]})
+    with open("./seed_submission.json", "w") as f:
+        json.dump(stored_results, f, indent=4)
+    print("Storing files for seed_submission ...")
+
+    return score
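
As a quick sanity check of the new hooks, the sketch below walks one hand-written record through doc_to_text, process_results, and the aggregation function. It is a minimal sketch and not part of the patch: the fake_doc dict and all of its field values are invented for illustration, and the file-path import assumes the script is run from the repository root after the patch is applied.

# Minimal sketch (assumptions: run from the repo root with the patch applied;
# fake_doc and its values are made up for illustration only).
import importlib.util

spec = importlib.util.spec_from_file_location(
    "seedbench_2_plus_utils", "lmms_eval/tasks/seedbench_2_plus/utils.py"
)
utils = importlib.util.module_from_spec(spec)
spec.loader.exec_module(utils)

# A hand-written document mimicking one SEED-Bench-2-Plus record (not real data).
fake_doc = {
    "question": "Which label is shown in the highlighted region? <img>",
    "choice_A": "North",
    "choice_B": "South",
    "choice_C": "chart.png",  # image-path choices get replaced by the img_token
    "choice_D": "East",
    "answer": "A",
    "question_id": "demo-0",
    "question_image_type": "chart",
}
kwargs = {
    "img_token": "<image>",
    "post_prompt": "Answer with the option's letter from the given choices directly.",
}

prompt = utils.seed_doc_to_text(fake_doc, kwargs)           # text sent to the model
scores = utils.seed_process_result(fake_doc, ["A. North"])  # raw output -> per-metric entries
acc = utils.seed_aggregation_result([scores["seedbench_2_plus_all"]])
print(prompt)
print(acc)  # 1.0, since the first letter of the prediction matches the answer

This is roughly the flow lmms-eval drives internally once the task is selected, with doc_to_visual supplying the RGB image alongside the prompt.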