From 1fad5d99be0db1822ce2b490eaff0c87ee5794f7 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Sat, 30 Nov 2024 19:38:54 +0800
Subject: [PATCH 01/10] reformat mix eval

---
 .../audio2text/mix_evals_audio2text.yaml      |  3 ++
 .../tasks/mix_evals/mix_evals_video2text.yaml |  5 ---
 .../{ => video2text}/_default_template_yaml   |  2 +-
 .../video2text/mix_evals_video2text.yaml      |  5 +++
 .../mix_evals_video2text_freeform.yaml        |  0
 .../mix_evals_video2text_mc.yaml              |  0
 .../mix_evals_video2text_openended.yaml       |  6 +--
 .../tasks/mix_evals/{ => video2text}/utils.py | 38 +++++++++++++------
 8 files changed, 38 insertions(+), 21 deletions(-)
 create mode 100644 lmms_eval/tasks/mix_evals/audio2text/mix_evals_audio2text.yaml
 delete mode 100644 lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml
 rename lmms_eval/tasks/mix_evals/{ => video2text}/_default_template_yaml (89%)
 create mode 100644 lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml
 rename lmms_eval/tasks/mix_evals/{ => video2text}/mix_evals_video2text_freeform.yaml (100%)
 rename lmms_eval/tasks/mix_evals/{ => video2text}/mix_evals_video2text_mc.yaml (100%)
 rename lmms_eval/tasks/mix_evals/{ => video2text}/mix_evals_video2text_openended.yaml (88%)
 rename lmms_eval/tasks/mix_evals/{ => video2text}/utils.py (91%)

diff --git a/lmms_eval/tasks/mix_evals/audio2text/mix_evals_audio2text.yaml b/lmms_eval/tasks/mix_evals/audio2text/mix_evals_audio2text.yaml
new file mode 100644
index 00000000..85b23377
--- /dev/null
+++ b/lmms_eval/tasks/mix_evals/audio2text/mix_evals_audio2text.yaml
@@ -0,0 +1,3 @@
+group: mix_evals_audio2text
+task:
+- mix_evals_audio2text_freeform
diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml b/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml
deleted file mode 100644
index e49612a8..00000000
--- a/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-group: mix_evals_video2text
-task:
-# - mix_evals_video2text_openconv
-- mix_evals_video2text_mc
-- mix_evals_video2text_freeform
\ No newline at end of file
diff --git a/lmms_eval/tasks/mix_evals/_default_template_yaml b/lmms_eval/tasks/mix_evals/video2text/_default_template_yaml
similarity index 89%
rename from lmms_eval/tasks/mix_evals/_default_template_yaml
rename to lmms_eval/tasks/mix_evals/video2text/_default_template_yaml
index bda3f8e8..73473c72 100644
--- a/lmms_eval/tasks/mix_evals/_default_template_yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/_default_template_yaml
@@ -2,7 +2,7 @@ dataset_kwargs:
   cache_dir: mix_evals_video2text
   token: true
   video: true
-dataset_path: lmms-lab/MixEvals_Video2Text
+dataset_path: MixEval/MixEval-X
 lmms_eval_specific_kwargs:
   default:
     post_prompt: ""
diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml
new file mode 100644
index 00000000..43fc1133
--- /dev/null
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml
@@ -0,0 +1,5 @@
+group: mix_evals_video2text
+task:
+- mix_evals_video2text_mc
+- mix_evals_video2text_freeform
+- mix_evals_video2text_openended
\ No newline at end of file
diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
similarity index 100%
rename from lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml
rename to lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml
similarity index 100%
rename from lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml
rename to lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml
diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_openended.yaml
similarity index 88%
rename from lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml
rename to lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_openended.yaml
index a62b2818..7d0baea8 100644
--- a/lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_openended.yaml
@@ -1,7 +1,7 @@
 include: _default_template_yaml
-dataset_name: "video2text_openended"
-task: "mix_evals_video2text_openconv"
-test_split: test
+dataset_name: "open_ended"
+task: "mix_evals_video2text_openended"
+test_split: video2text
 output_type: generate_until
 doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
 doc_to_text: !function utils.mix_evals_video2text_doc_to_text_open_convs
diff --git a/lmms_eval/tasks/mix_evals/utils.py b/lmms_eval/tasks/mix_evals/video2text/utils.py
similarity index 91%
rename from lmms_eval/tasks/mix_evals/utils.py
rename to lmms_eval/tasks/mix_evals/video2text/utils.py
index cd1d8e5b..40e545fd 100644
--- a/lmms_eval/tasks/mix_evals/utils.py
+++ b/lmms_eval/tasks/mix_evals/video2text/utils.py
@@ -115,18 +115,32 @@ def get_eval(model_response: str, ground_truth: str, max_tokens: int, retries: i
 cache_dir = os.path.join(cache_dir)
 
 
-# Pass in video path here
-# Can only work correctly with video llm
+def mix_evals_doc_to_visual(doc, modality):
+    visual = []
+    for video_path in doc["input_file"]:
+        video_path = os.path.join(cache_dir, video_path)
+        if os.path.exists(video_path):
+            video_path = video_path
+        elif os.path.exists(video_path.replace("mp4", "MP4")):
+            video_path = video_path.replace("mp4", "MP4")
+        else:
+            sys.exit(f"video path:{video_path} does not exist, please check")
+
+        if modality == "video":
+            visual.append(video_path)
+        elif modality == "image":
+            visual.append(video_path)
+        else:
+            sys.exit(f"modality:{modality} is not supported, please check")
+    return visual
+
+
 def mix_evals_video2text_doc_to_visual(doc):
-    video_path = doc["video_path"]
-    video_path = os.path.join(cache_dir, video_path)
-    if os.path.exists(video_path):
-        video_path = video_path
-    elif os.path.exists(video_path.replace("mp4", "MP4")):
-        video_path = video_path.replace("mp4", "MP4")
-    else:
-        sys.exit(f"video path:{video_path} does not exist, please check")
-    return [video_path]
+    return mix_evals_doc_to_visual(doc, "video")
+
+
+def mix_evals_image2text_doc_to_visual(doc):
+    return mix_evals_doc_to_visual(doc, "image")
 
 
 # This is the place where you format your question
@@ -140,7 +154,7 @@ def mix_evals_video2text_doc_to_text(doc, lmms_eval_specific_kwargs=None):
     if "post_prompt" in lmms_eval_specific_kwargs:
         post_prompt = lmms_eval_specific_kwargs["post_prompt"]
 
-    user_prompt = doc["prompt"]
+    user_prompt = doc["query"]
 
     if "options" in doc:
         option_prompt = "Here are the options:\n"

From 715df25ce4463381f4881a728f0bb6f583fbedd2 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Sat, 30 Nov 2024 20:54:09 +0800
Subject: [PATCH 02/10] video2text fix

---
 .../mix_evals/video2text/mix_evals_video2text_freeform.yaml | 6 +++---
 .../tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml | 6 +++---
 lmms_eval/tasks/mix_evals/video2text/utils.py               | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
index e4495b50..e4800d60 100644
--- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
@@ -1,10 +1,10 @@
-dataset_name: "video2text_closeended_free-form"
 task: "mix_evals_video2text_freeform"
-test_split: test
+dataset_name: "video2text"
+test_split: free_form
 output_type: generate_until
 doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
 doc_to_text: !function utils.mix_evals_video2text_doc_to_text
-doc_to_target: "{{target}}"
+doc_to_target: "{{reference_answer}}"
 process_results: !function utils.mix_evals_video2text_process_results_freeform
 metric_list:
   - metric: gpt_eval
diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml
index fcca0731..ade74701 100644
--- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml
@@ -1,11 +1,11 @@
 include: _default_template_yaml
-dataset_name: "video2text_closeended_multiple-choice"
 task: "mix_evals_video2text_mc"
-test_split: test
+dataset_name: "video2text"
+test_split: multiple_choice
 output_type: generate_until
 doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
 doc_to_text: !function utils.mix_evals_video2text_doc_to_text
-doc_to_target: "{{target}}"
+doc_to_target: "{{reference_answer}}"
 generation_kwargs:
   max_new_tokens: 5
diff --git a/lmms_eval/tasks/mix_evals/video2text/utils.py b/lmms_eval/tasks/mix_evals/video2text/utils.py
index 40e545fd..cd4da6f5 100644
--- a/lmms_eval/tasks/mix_evals/video2text/utils.py
+++ b/lmms_eval/tasks/mix_evals/video2text/utils.py
@@ -232,13 +232,13 @@ def mix_evals_video2text_process_results_open_convs(doc, result):
 
 def mix_evals_video2text_process_results_freeform(doc, result):
     pred = result[0]
-    ground_truth_str = ", ".join([f'"{gt}"' for gt in doc["target"]])
+    ground_truth_str = ", ".join([f'"{gt}"' for gt in doc["reference_answer"]])
     ground_truth_str = f"[{ground_truth_str}]"
     content = eval_prompt.format(model_response=pred, ground_truth=ground_truth_str)
     eval_answer, model_name = get_eval(model_response=pred, ground_truth=ground_truth_str, max_tokens=1024)
     return {
-        "submission": {"pred": pred, "question_idx": doc["question_index"], "target": doc["target"], "eval_answer": eval_answer, "gpt_prompt": content},
-        "gpt_eval": {"pred": pred, "question_idx": doc["question_index"], "target": doc["target"], "eval_answer": eval_answer, "gpt_prompt": content},
+        "submission": {"pred": pred, "question_idx": doc["id"], "target": doc["reference_answer"], "eval_answer": eval_answer, "gpt_prompt": content},
+        "gpt_eval": {"pred": pred, "question_idx": doc["id"], "target": doc["reference_answer"], "eval_answer": eval_answer, "gpt_prompt": content},
     }

From cc232f7bd2f7606d28f70834e296e0b8e2eba427 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Sun, 1 Dec 2024 17:04:38 +0800
Subject: [PATCH 03/10] video

---
 lmms_eval/filters/extraction.py               |   3 +
 .../video2text/mix_evals_video2text.yaml      |   2 +-
 .../mix_evals_video2text_freeform_hard.yaml   |  25 ++
 .../video2text/mix_evals_video2text_hard.yaml |   5 +
 .../video2text/mix_evals_video2text_mc.yaml   |   5 +-
 .../mix_evals_video2text_mc_hard.yaml         |  31 +++
 lmms_eval/tasks/mix_evals/video2text/utils.py | 250 ++++++++++++------
 7 files changed, 237 insertions(+), 84 deletions(-)
 create mode 100644 lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform_hard.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_hard.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc_hard.yaml

diff --git a/lmms_eval/filters/extraction.py b/lmms_eval/filters/extraction.py
index 392e21ad..9dbc212d 100755
--- a/lmms_eval/filters/extraction.py
+++ b/lmms_eval/filters/extraction.py
@@ -1,7 +1,10 @@
+import os
 import re
 import sys
 import unicodedata
 
+import openai
+
 from lmms_eval.api.filter import Filter
 
 
diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml
index 43fc1133..4f3e2c8a 100644
--- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml
@@ -2,4 +2,4 @@ group: mix_evals_video2text
 task:
 - mix_evals_video2text_mc
 - mix_evals_video2text_freeform
-- mix_evals_video2text_openended
\ No newline at end of file
+# - mix_evals_video2text_openended
\ No newline at end of file
diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform_hard.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform_hard.yaml
new file mode 100644
index 00000000..37690431
--- /dev/null
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform_hard.yaml
@@ -0,0 +1,25 @@
+task: "mix_evals_video2text_freeform_hard"
+dataset_name: "video2text"
+test_split: free_form_hard
+output_type: generate_until
+doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
+doc_to_text: !function utils.mix_evals_video2text_doc_to_text
+doc_to_target: "{{reference_answer}}"
+process_results: !function utils.mix_evals_video2text_process_results_freeform
+metric_list:
+  - metric: gpt_eval
+    aggregation: !function utils.mix_evals_video2text_gpt_eval
+    higher_is_better: true
+
+generation_kwargs:
+  max_new_tokens: 16
+
+include: _default_template_yaml
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
+    post_prompt: "Answer the question using a single word or phrase."
+  gpt4v:
+    pre_prompt: "These are frames from a video. Please answer the following questions about the video with a short phrase."
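+    # gpt4v folds the short-phrase instruction into the pre prompt, so its post prompt stays empty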
+ post_prompt: "" diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_hard.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_hard.yaml new file mode 100644 index 00000000..2817b420 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_hard.yaml @@ -0,0 +1,5 @@ +group: mix_evals_video2text_hard +task: +- mix_evals_video2text_mc_hard +- mix_evals_video2text_freeform_hard +# - mix_evals_video2text_openended \ No newline at end of file diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml index ade74701..1e36fab9 100644 --- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml +++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml @@ -20,10 +20,7 @@ metric_list: filter_list: - name: "flexible-extract" filter: - - function: !function utils.MultiChoiceRegexFilter - group_select: 0 - ignore_case: true - ignore_punctuation: true + - function: !function utils.GPTMultiChoiceFilter lmms_eval_specific_kwargs: default: diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc_hard.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc_hard.yaml new file mode 100644 index 00000000..97754a67 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc_hard.yaml @@ -0,0 +1,31 @@ +include: _default_template_yaml +task: "mix_evals_video2text_mc_hard" +dataset_name: "video2text" +test_split: multiple_choice_hard +output_type: generate_until +doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual +doc_to_text: !function utils.mix_evals_video2text_doc_to_text +doc_to_target: "{{reference_answer}}" + +generation_kwargs: + max_new_tokens: 5 + +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.GPTMultiChoiceFilter + +lmms_eval_specific_kwargs: + default: + pre_prompt: "These are frames from a video. Please answer the following questions about the video." + post_prompt: "Answer with the option's letter from the given choices directly." + gpt4v: + pre_prompt: "These are frames from a video. Please answer the following questions about the video." + post_prompt: "Answer with the option's letter from the given choices directly." 
diff --git a/lmms_eval/tasks/mix_evals/video2text/utils.py b/lmms_eval/tasks/mix_evals/video2text/utils.py
index cd4da6f5..ad6e1f3d 100644
--- a/lmms_eval/tasks/mix_evals/video2text/utils.py
+++ b/lmms_eval/tasks/mix_evals/video2text/utils.py
@@ -6,12 +6,13 @@ import time
 from pathlib import Path
 
+import openai
 import requests
 import yaml
 from loguru import logger as eval_logger
 
 import lmms_eval.tasks._task_utils.file_utils as file_utils
-from lmms_eval.filters.extraction import ExtendedRegexFilter
+from lmms_eval.filters import Filter
 
 with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
     raw_data = f.readlines()
@@ -26,73 +27,80 @@ NUM_SECONDS_TO_SLEEP = 5
 GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
 API_TYPE = os.getenv("API_TYPE", "openai")
+API_VERSION = "gpt-3.5-turbo-0125"
+MAX_NEW_TOKENS = 999
 
 if API_TYPE == "openai":
-    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json",
-    }
+    client = openai.OpenAI()
 elif API_TYPE == "azure":
-    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
-    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
-    headers = {
-        "api-key": API_KEY,
-        "Content-Type": "application/json",
-    }
-
-eval_prompt = """You are an AI assistant who will help me to evaluate the quality of a model response to a few candidate ground truth answers.
-
-Some criterion
-- Response that perfectly reflect the meaning of the ground truth: 1 point
-- Response that reflect none of the key points in the ground truth: 0 point
-- Some part in the response are correct but some parts in the ground truth are not mentioned in the response: 0.5 point
-- Some part in the response are correct but other parts in the response are not mentioned in the ground truth: 0.5 point
-
-Here're some examples about the scoring criterion and format:
-model response: Steam Cleaning Services
-ground truth: ["steam clean", "steam clean", "cleaning", "car", "steam clean"],
-Point: 1
-
-model response: A cowboy action shooter.
-ground truth: ["man"]
-Point: 1
-
-model response: I'm sorry, but I can't assist with that request.
-ground truth: ["quality"]
-Point: 0
-
-Let's begin this task:
-model response: {model_response}
-ground truth: {ground_truth}
-Point:"""
-
-
-def get_eval(model_response: str, ground_truth: str, max_tokens: int, retries: int = 5):
-    global headers
-    content = eval_prompt.format(model_response=model_response, ground_truth=ground_truth)
-
-    messages = [
-        {"role": "user", "content": content},
-    ]
+    if "AZURE_ENDPOINT" in os.environ:
+        API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    else:
+        API_URL = os.getenv("AZURE_OPENAI_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    if "AZURE_OPENAI_API_KEY" in os.environ:
+        API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "YOUR_API_KEY")
+    else:
+        API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+    client = openai.AzureOpenAI(api_key=API_KEY, azure_endpoint=API_URL)
+
+
+video2text_gpt_judge_for_closeended_freeform = lambda prompt, gold_ans, response: [
+    {"role": "system", "content": f"In this task, I want you to act as a judge."},
+    {
+        "role": "user",
+        "content": f"""You will be provided with a question, its golden answer(s), and the model's answer, while the context of the question, which is one or more videos, is not given here. Your task is to judge how correct the model's answer is based on the golden answer(s), without seeing the input videos of the question, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's answer conforms to or contradicts the golden answer(s), and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples.
+
+Example 1:
+Question: what does this video want to express
+Golden Answer(s): introduce method of playing
+Model's Answer: Volleyball serve \n
+Your Judgment: The model's answer "Volleyball serve" suggests a specific action, which may be part of what the video demonstrates. However, it misses the broader educational intent implied by the golden answer "introduce method of playing". Therefore, the answer is partially correct. The Correctness Score: [[0.5]]
+
+Example 2:
+Question: who do two other boys with surprised looks assist up?
+Golden Answer(s): boy
+Model's Answer: Boy.
+Your Judgment: The model's answer "Boy." precisely matches the golden answer which states the two other boys assist a "boy". The Correctness Score: [[1.0]]
+
+Example 3:
+Question: what did the lady do at the end of the video after their performance
+Golden Answer(s): picks up her phone
+Model's Answer: Nothing.
+Your Judgment: The model's answer "Nothing." directly contradicts the golden answer which states that the lady "picks up her phone" at the end of the video after their performance. Since the model's response completely misses the specific action described in the golden answer, it is incorrect. The Correctness Score: [[0.0]]
+
+Note that each one of the golden answers is considered correct. Thus if the model's answer matches any one of the golden answers, it should be considered correct. Judge the below case, give the brief reasoning process and the correctness score.
+
+Question: {prompt}
+Golden Answer(s): {gold_ans}
+Model's Answer: {response}
+Your Judgment:
+""",
+    },
+]
+
+
+def get_eval(question, model_response: str, ground_truth: str, max_tokens: int, retries: int = 5):
+    global client
+    messages = video2text_gpt_judge_for_closeended_freeform(prompt=question, gold_ans=ground_truth, response=model_response)
 
     payload = {
         "model": GPT_EVAL_MODEL_NAME,
         "messages": messages,
-        "temperature": 0.2,
+        # "temperature": 0.2,
         "max_tokens": max_tokens,
     }
 
     for attempt in range(retries):
         try:
-            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
-            response.raise_for_status()
+            # response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+            response = client.chat.completions.create(**payload)
+            # response.raise_for_status()
             response_data = response.json()
 
             content = response_data["choices"][0]["message"]["content"].strip()
+            # content = response.choices[0].message.content.strip()
             if content != "":
-                return content, response_data["model"]
+                return content, response["model"]
             break  # If successful, break out of the loop
 
         except Exception as e:
@@ -156,7 +164,7 @@ def mix_evals_video2text_doc_to_text(doc, lmms_eval_specific_kwargs=None):
     if "post_prompt" in lmms_eval_specific_kwargs:
         post_prompt = lmms_eval_specific_kwargs["post_prompt"]
 
     user_prompt = doc["query"]
 
-    if "options" in doc:
+    if "options" in doc and len(doc["options"]) > 1:
         option_prompt = "Here are the options:\n"
         for idx, option in enumerate(doc["options"]):
             char_idx = chr(ord("A") + idx)
@@ -235,7 +243,7 @@ def mix_evals_video2text_process_results_freeform(doc, result):
     ground_truth_str = f"[{ground_truth_str}]"
     content = eval_prompt.format(model_response=pred, ground_truth=ground_truth_str)
-    eval_answer, model_name = get_eval(model_response=pred, ground_truth=ground_truth_str, max_tokens=1024)
+    eval_answer, model_name = get_eval(model_response=pred, ground_truth=ground_truth_str, max_tokens=MAX_NEW_TOKENS, question=doc["query"])
     return {
         "submission": {"pred": pred, "question_idx": doc["id"], "target": doc["reference_answer"], "eval_answer": eval_answer, "gpt_prompt": content},
         "gpt_eval": {"pred": pred, "question_idx": doc["id"], "target": doc["reference_answer"], "eval_answer": eval_answer, "gpt_prompt": content},
@@ -271,30 +279,114 @@ def mix_evals_video2text_aggregate_gen(results, args):
     mix_evals_video2text_aggregate_submissions(results, args, "OpenConvs")
 
 
-class MultiChoiceRegexFilter(ExtendedRegexFilter):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+video2text_gpt_judge_for_closeended_multiplechoice = lambda prompt, options, response: [
+    {"role": "system", "content": f"In this task, I want you to act as an option extractor."},
+    {
+        "role": "user",
+        "content": f"""You will be provided with a multiple-choice question, its options, and the model's answer, while the context of the question, which is one or more videos, is not given here. Your task is to extract or judge which option is chosen by the model based on its response, without seeing the context of the question. The extracted option should be one of the provided option letters. You should first briefly give your reasoning process, and then give the extracted option letter. The extracted option must strictly follow this format: \"[[option letter]]\", e.g., \"The option chosen by the model: [[A]]\".
+Below are some examples.
+
+Example 1:
+Question: What did he do to the car?
+Options:
+A. Paint the car
+B. Put plastic over the car
+C. Put metal over the car
+D. Cut the car
+Model's Answer: put plastic over the car.
+Your Judgment: The model's response directly aligns with option B, which is "Put plastic over the car." The response given is a paraphrase of this option without deviating in meaning. The option chosen by the model: [[B]]
+
+Example 2:
+Question: How did Eddie know Pam and Justin before Justin was killed?
+Options:
+A. They were part of the theater company
+B. They were high school friends
+C. They went to college together
+D. They were cousins
+E. They were siblings
+Model's Answer: A.
+Your Judgment: The model's answer directly provides the option letter "A." The option chosen by the model: [[A]]
+
+Example 3:
+Question: why do the people move in the same manner
+Options:
+A. uniform
+B. dancing with the baby
+C. exercising together
+D. stay together
+E. singing and dancing
+Model's Answer: sing and dance
+Your Judgment: The model's response "sing and dance" closely aligns with option E, which is "singing and dancing." The response provided is a direct paraphrase of this option, modifying only slightly the form of the words (from gerund to infinitive) but maintaining the same core activities described in the option. The option chosen by the model: [[E]]
+
+When you think that the model's answer does not match any of the given options, please choose the option that is the closest to the model's answer.
+Give the brief reasoning process and the extracted option for the below case.
+
+Question: {prompt}
+Options:
+{options}
+Model's Answer: {response}
+Your Judgment:
+""",
+    },
+]
+
+
+class GPTMultiChoiceFilter(Filter):
+    def __init__(self, gpt_version: str = "gpt-3.5-turbo-0125", retries: int = 5):
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+        self.gpt_version = gpt_version
+
+        if API_TYPE == "openai":
+            self.client = openai.OpenAI()
+        elif API_TYPE == "azure":
+            self.client = openai.AzureOpenAI(api_key=API_KEY, azure_endpoint=API_URL)
+
+        self.retries = retries
 
     def apply(self, resps, docs):
-        filtered_resps = []
-
-        for r, doc in zip(resps, docs):
-            # Regex to directly extract the option letter from the model response
-            option_letter_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)")
-
-            # Process each response
-            filtered = []
-            for resp in r:
-                # Try to match the option letter at the start of the response
-                match = option_letter_regex.match(resp)
-                if match:
-                    # If a match is found, append the matched letter
-                    filtered.append(match.group(1))
-                else:
-                    # If no match, return the original response
-                    filtered.append(resp)
-
-            # Assuming we need the first response that matches or the original response
-            filtered_resps.append(filtered[0])
-
-        return filtered_resps
+        """
+        Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
+        Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
+        if pass in [, ] should return
+        [, ]
+        """
+        results = []
+        for response, doc in zip(resps, docs):
+            query = doc["query"]
+            options = "\n".join([f"{chr(ord('A') + idx)}. {option}" for idx, option in enumerate(doc["options"])])
+            message = video2text_gpt_judge_for_closeended_multiplechoice(prompt=query, options=options, response=response)
+            payload = {
+                "model": self.gpt_version,
+                "messages": message,
+                "max_tokens": MAX_NEW_TOKENS,
+            }
+            result = 0
+            for attempt in range(self.retries):
+                try:
+                    # response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+                    response = client.chat.completions.create(**payload)
+                    # response.raise_for_status()
+                    response_data = response.json()
+
+                    content = response_data["choices"][0]["message"]["content"].strip()
+                    # content = response.choices[0].message.content.strip()
+                    if content != "":
+                        match = re.search(r"\[\[([A-Z])\]\]", content)
+                        if match:
+                            result = ord(match.group(1)) - ord("A")
+                        else:
+                            result = 0
+                        break  # If successful, break out of the loop
+
+                except Exception as e:
+                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
+                    if attempt < self.retries:  # If we have retries left, sleep and then continue to next attempt
+                        time.sleep(NUM_SECONDS_TO_SLEEP)
+                    else:  # If this was the last attempt, log and return empty
+                        eval_logger.error(f"All {self.retries} attempts failed. Last error message: {e}")
+                        result = 0
+                        break
+            results.append(result)
+        return results

From b89847a308b97ae783598c844ad75045b048a408 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Sun, 1 Dec 2024 19:56:11 +0800
Subject: [PATCH 04/10] Add image-to-text evaluation tasks and templates

---
 .../image2text/_default_template_yaml         |  12 +
 .../image2text/mix_evals_image2text.yaml      |   5 +
 .../mix_evals_image2text_freeform.yaml        |  17 +
 .../mix_evals_image2text_freeform_hard.yaml   |  25 ++
 .../image2text/mix_evals_image2text_hard.yaml |   5 +
 .../image2text/mix_evals_image2text_mc.yaml   |  23 ++
 .../mix_evals_image2text_mc_hard.yaml         |  23 ++
 .../mix_evals_image2text_openended.yaml       |  14 +
 lmms_eval/tasks/mix_evals/image2text/utils.py | 389 ++++++++++++++++++
 .../video2text/_default_template_yaml         |   1 -
 .../mix_evals_video2text_freeform.yaml        |   2 +-
 lmms_eval/tasks/mix_evals/video2text/utils.py |  60 ++++++---
 12 files changed, 553 insertions(+), 23 deletions(-)
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/_default_template_yaml
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_hard.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_openended.yaml
 create mode 100644 lmms_eval/tasks/mix_evals/image2text/utils.py

diff --git a/lmms_eval/tasks/mix_evals/image2text/_default_template_yaml b/lmms_eval/tasks/mix_evals/image2text/_default_template_yaml
new file mode 100644
index 00000000..c75156f4
--- /dev/null
+++ b/lmms_eval/tasks/mix_evals/image2text/_default_template_yaml
@@ -0,0 +1,12 @@
+dataset_path: MixEval/MixEval-X
+dataset_kwargs:
+  cache_dir: mix_evals_image2text
+lmms_eval_specific_kwargs:
+  default:
+    post_prompt: ""
+    pre_prompt: ""
+  gpt4v:
+    post_prompt: ""
+    pre_prompt: ""
+metadata:
+  version: 0
diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml
new file mode 100644
index 00000000..053e13df
--- /dev/null
+++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml
@@ -0,0 +1,5 @@
+group: mix_evals_image2text
+task:
+- mix_evals_image2text_mc
+- mix_evals_image2text_freeform
+# - mix_evals_video2text_openended
\ No newline at end of file
diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml
new file mode 100644
index 00000000..1a21c46f
--- /dev/null
+++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml
@@ -0,0 +1,17 @@
+task: "mix_evals_image2text_freeform"
+dataset_name: "image2text"
+test_split: free_form
+output_type: generate_until
+doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
+doc_to_text: !function utils.mix_evals_image2text_doc_to_text
+doc_to_target: "{{reference_answer}}"
+process_results: !function utils.mix_evals_image2text_process_results_freeform
+metric_list:
+  - metric: gpt_eval
+    aggregation: !function utils.mix_evals_image2text_gpt_eval
+    higher_is_better: true
+
+generation_kwargs:
+  max_new_tokens: 16
+
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml
new file mode 100644
index 00000000..e0aeea52
--- /dev/null
+++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml
@@ -0,0 +1,25 @@
+task: "mix_evals_image2text_freeform_hard"
+dataset_name: "image2text"
+test_split: free_form_hard
+output_type: generate_until
+doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
+doc_to_text: !function utils.mix_evals_image2text_doc_to_text
+doc_to_target: "{{reference_answer}}"
+process_results: !function utils.mix_evals_image2text_process_results_freeform
+metric_list:
+  - metric: gpt_eval
+    aggregation: !function utils.mix_evals_image2text_gpt_eval
+    higher_is_better: true
+
+generation_kwargs:
+  max_new_tokens: 16
+
+include: _default_template_yaml
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "Please answer the following questions about the image."
+    post_prompt: ""
+  gpt4v:
+    pre_prompt: "Please answer the following questions about the image."
+ post_prompt: "" diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_hard.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_hard.yaml new file mode 100644 index 00000000..77d7f845 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_hard.yaml @@ -0,0 +1,5 @@ +group: mix_evals_image2text_hard +task: +- mix_evals_image2text_mc_hard +- mix_evals_image2text_freeform_hard +# - mix_evals_image2text_openended \ No newline at end of file diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml new file mode 100644 index 00000000..4ac669b5 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml @@ -0,0 +1,23 @@ +include: _default_template_yaml +task: "mix_evals_image2text_mc" +dataset_name: "image2text" +test_split: multiple_choice +output_type: generate_until +doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual +doc_to_text: !function utils.mix_evals_image2text_doc_to_text +doc_to_target: "{{reference_answer}}" + +generation_kwargs: + max_new_tokens: 5 + +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.GPTMultiChoiceFilter diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml new file mode 100644 index 00000000..58fae82a --- /dev/null +++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml @@ -0,0 +1,23 @@ +include: _default_template_yaml +task: "mix_evals_image2text_mc_hard" +dataset_name: "image2text" +test_split: multiple_choice_hard +output_type: generate_until +doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual +doc_to_text: !function utils.mix_evals_image2text_doc_to_text +doc_to_target: "{{reference_answer}}" + +generation_kwargs: + max_new_tokens: 5 + +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.GPTMultiChoiceFilter diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_openended.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_openended.yaml new file mode 100644 index 00000000..5e05aea5 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_openended.yaml @@ -0,0 +1,14 @@ +include: _default_template_yaml +dataset_name: "open_ended" +task: "mix_evals_image2text_openended" +test_split: image2text +output_type: generate_until +doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual +doc_to_text: !function utils.mix_evals_image2text_doc_to_text_open_convs +doc_to_target: "" +process_results: !function utils.mix_evals_video2text_process_results_open_convs + +metric_list: + - metric: submission + aggregation: !function utils.mix_evals_video2text_aggregate_gen + higher_is_better: true diff --git a/lmms_eval/tasks/mix_evals/image2text/utils.py b/lmms_eval/tasks/mix_evals/image2text/utils.py new file mode 100644 index 00000000..32333044 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/image2text/utils.py @@ -0,0 +1,389 @@ +import ast +import datetime +import json +import os +import random +import re +import sys +import time +from pathlib import Path + +import openai +import yaml 
+from loguru import logger as eval_logger
+from PIL import Image
+
+import lmms_eval.tasks._task_utils.file_utils as file_utils
+from lmms_eval.filters import Filter
+
+with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+
+NUM_SECONDS_TO_SLEEP = 5
+API_TYPE = os.getenv("API_TYPE", "openai")
+MODEL_VERSION = "gpt-3.5-turbo-0125"
+MAX_NEW_TOKENS = 999
+
+if API_TYPE == "openai":
+    client = openai.OpenAI()
+elif API_TYPE == "azure":
+    if "AZURE_ENDPOINT" in os.environ:
+        API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    else:
+        API_URL = os.getenv("AZURE_OPENAI_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    if "AZURE_OPENAI_API_KEY" in os.environ:
+        API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "YOUR_API_KEY")
+    else:
+        API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+    client = openai.AzureOpenAI(api_key=API_KEY, azure_endpoint=API_URL)
+
+
+image2text_gpt_judge_for_closeended_freeform = lambda prompt, gold_ans, response: [
+    {"role": "system", "content": f"In this task, I want you to act as a judge."},
+    {
+        "role": "user",
+        "content": f"""You will be provided with a question, its golden answer(s), and the model's answer, while the context of the question, which is one or more images, is not given here. Your task is to judge how correct the model's answer is based on the golden answer(s), without seeing the input images of the question, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's answer conforms to or contradicts the golden answer(s), and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples.
+
+Example 1:
+Question: what is this advertising?
+Golden Answer(s): garden annual; seeds; seeds; seeds; seeds; seeds; seeds; seeds; seeds; cole's garden annual
+Model's Answer: Seed
+Your Judgment: The golden answers consistently mention "seeds" suggesting an advertisement for a seed catalog. The model's answer, "Seed", aligns exactly with this description. The Correctness Score: [[1.0]]
+
+Example 2:
+Question: Who is making a face?
+Golden Answer: child
+Model's Answer: A man.
+Your Judgment: The golden answer specifies a "child" making a face, but the model answered "A man", which is incorrect as it refers to a different age group. The Correctness Score: [[0.0]]
+
+Example 3:
+Question: what road is to the right?
+Golden Answer: troublesome valley rd; troublesome valley rd.; troublesome valley; troublesome valley road; valley road; troublesome valley; troublesome valley road; troublesome valley ; troublesome valley rd; troublesome valley rd.
+Model's Answer: troublesome road
+Your Judgment: The golden answers all specify the name of the road as "troublesome valley rd" or variations of this phrase with consistent reference to "troublesome valley." The model's answer, "troublesome road," captures the "troublesome" aspect but omits the critical "valley" part of the name, which is crucial for full accuracy. Thus, the model's answer partially matches the golden answer but lacks complete specificity. The Correctness Score: [[0.6]]
+
+Note that each one of the golden answers is considered correct. Thus if the model's answer matches any one of the golden answers, it should be considered correct. Judge the below case, give the brief reasoning process and the correctness score.
+
+Question: {prompt}
+Golden Answer(s): {gold_ans}
+Model's Answer: {response}
+Your Judgment:
+""",
+    },
+]
+
+image2text_gpt_judge_for_closeended_multiplechoice = lambda prompt, options, response: [
+    {"role": "system", "content": f"In this task, I want you to act as an option extractor."},
+    {
+        "role": "user",
+        "content": f"""You will be provided with a multiple-choice question, its options, and the model's answer, while the context of the question, which is one or more images, is not given here. Your task is to extract or judge which option is chosen by the model based on its response, without seeing the context of the question. The extracted option should be one of the provided option letters. You should first briefly give your reasoning process, and then give the extracted option letter. The extracted option must strictly follow this format: \"[[option letter]]\", e.g., \"The option chosen by the model: [[A]]\".
+Below are some examples.
+
+Example 1:
+Question: Where are the cast of the television show located in the image?
+Options:
+A. In the foreground
+B. In the background
+C. In the center
+D. At the edges
+Model's Answer: C. In the center
+Your Judgment: The model's answer clearly states "C. In the center", indicating that the correct option, according to the model, is in the center. The option chosen by the model: [[C]].
+
+Example 2:
+Question: on the left was painted during the
+Options:
+A. first or second century C. E.
+B. sixth or seventh century C. E.
+C. tenth or eleventh century C.E.
+D. fourteenth or fifteenth century C. E.
+Model's Answer: The correct answer is option D, the fourteenth or fifteenth century C.E.
+Your Judgment: The model's response specifies "option D, the fourteenth or fifteenth century C.E." directly as the correct answer. The option chosen by the model: [[D]].
+
+Example 3:
+Question: what does the diagram show's you information about
+Options:
+A. Photosynthesis
+B. The plant getting fed
+C. A picture of the plant
+D. What happens to a plant daily
+Model's Answer: The diagram shows the process of photosynthesis, which is the process by which plants convert sunlight, carbon dioxide, and water into oxygen and glucose.
+Your Judgment: The model's answer mentions "the process of photosynthesis," which directly corresponds to option A, "Photosynthesis". Therefore, the correct option according to the model is photosynthesis. The option chosen by the model: [[A]].
+
+Give the brief reasoning process and the extracted option for the below case:
+
+Question: {prompt}
+Options:
+{options}
+Model's Answer: {response}
+Your Judgment:
+""",
+    },
+]
+
+
+def get_score_from_judge(judge_response):
+    """
+    Get the score from the judge response.
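+    The judge is prompted to reply with a bracketed score such as "[[0.7]]";
+    if no marker is found, a random score is returned as a last-resort fallback.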
+ """ + one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") + one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") + + match = re.search(one_score_pattern, judge_response) + if not match: + match = re.search(one_score_pattern_backup, judge_response) + + if match: + rating = ast.literal_eval(match.groups()[0]) + else: + rating = round(random.random(), 1) + + return float(rating) + + +def get_eval(question, model_response: str, ground_truth: str, max_tokens: int, retries: int = 5): + global client + messages = image2text_gpt_judge_for_closeended_freeform(prompt=question, gold_ans=ground_truth, response=model_response) + + payload = { + "model": MODEL_VERSION, + "messages": messages, + # "temperature": 0.2, + "max_tokens": max_tokens, + } + + for attempt in range(retries): + try: + # response = requests.post(API_URL, headers=headers, json=payload, timeout=60) + response = client.chat.completions.create(**payload) + # response.raise_for_status() + response_data = response.json() + + # content = response_data["choices"][0]["message"]["content"].strip() + content = response.choices[0].message.content.strip() + if content != "": + return content + break # If successful, break out of the loop + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}") + if attempt < retries: # If we have retries left, sleep and then continue to next attempt + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. Last error message: {e}") + return "[[0.0]]" + return "[[0.0]]" + + +# A bit ugly here +# But the idea is that we will unzip all the zip files +# To HF HOME cache dir +# And load it here +HF_HOME = os.environ["HF_HOME"] +cache_dir = config["dataset_kwargs"]["cache_dir"] +cache_dir = os.path.join(HF_HOME, cache_dir) +cache_dir = os.path.join(cache_dir) + + +def mix_evals_image2text_doc_to_visual(doc): + visual = [] + for image_path in doc["input_file"]: + image_path = os.path.join(cache_dir, image_path) + if os.path.exists(image_path): + image_path = image_path + + visual.append(Image.open(image_path).convert("RGB")) + + return visual + + +# This is the place where you format your question +def mix_evals_image2text_doc_to_text(doc, lmms_eval_specific_kwargs=None): + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in lmms_eval_specific_kwargs: + pre_prompt = lmms_eval_specific_kwargs["pre_prompt"] + if "post_prompt" in lmms_eval_specific_kwargs: + post_prompt = lmms_eval_specific_kwargs["post_prompt"] + + user_prompt = doc["query"] + + if "options" in doc and len(doc["options"]) > 1: + option_prompt = "Here are the options:\n" + for idx, option in enumerate(doc["options"]): + char_idx = chr(ord("A") + idx) + option = option.strip() + option_prompt += f"{char_idx}. 
{option}\n" + + option_prompt = option_prompt.rstrip("\n") + user_prompt = f"{user_prompt}\n{option_prompt}" + + if pre_prompt: + user_prompt = f"{pre_prompt}\n{user_prompt}" + + if post_prompt: + user_prompt = f"{user_prompt}\n{post_prompt}" + return user_prompt + + +OPEN_CONVS_PROMPT = """{PRE} +{FIRST} +{POST} +""" + + +def mix_evals_image2text_doc_to_text_open_convs(doc, lmms_eval_specific_kwargs=None): + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in lmms_eval_specific_kwargs: + pre_prompt = lmms_eval_specific_kwargs["pre_prompt"] + if "post_prompt" in lmms_eval_specific_kwargs: + post_prompt = lmms_eval_specific_kwargs["post_prompt"] + + filtered_first_turn = re.sub(r"", "", doc["first_turn_user_prompt"]) + return OPEN_CONVS_PROMPT.format( + PRE=pre_prompt, + POST=post_prompt, + FIRST=filtered_first_turn, + ) + + +MODEL_CONVS_PROMPT = """{FIRST} +{MODEL_RESPONSE} +{PRE} +{SECOND} +{POST} +""" + + +def mix_evals_image2text_doc_to_text_open_2nd_convs(doc, lmms_eval_specific_kwargs=None): + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in lmms_eval_specific_kwargs: + pre_prompt = lmms_eval_specific_kwargs["pre_prompt"] + if "post_prompt" in lmms_eval_specific_kwargs: + post_prompt = lmms_eval_specific_kwargs["post_prompt"] + + return MODEL_CONVS_PROMPT.format( + PRE=pre_prompt, + POST=post_prompt, + FIRST=doc["first_turn_user_prompt"], + SECOND=doc["second_turn_user_prompt"], + MODEL_RESPONSE=doc["model_response"], + ) + + +def mix_evals_image2text_process_results_open_convs(doc, result): + pred = result[0] + return {"submission": {"pred": pred, "question_idx": doc["question_index"], "first_turn_video_caption": doc["first_turn_video_caption"], "target": ""}} + + +def mix_evals_image2text_process_results_freeform(doc, result): + pred = result[0] + ground_truth_str = ", ".join([f'"{gt}"' for gt in doc["reference_answer"]]) + ground_truth_str = f"[{ground_truth_str}]" + content = image2text_gpt_judge_for_closeended_freeform(response=pred, gold_ans=ground_truth_str, prompt=doc["query"]) + eval_answer = get_eval(model_response=pred, ground_truth=ground_truth_str, max_tokens=MAX_NEW_TOKENS, question=doc["query"]) + return { + "submission": {"pred": pred, "question_idx": doc["id"], "target": doc["reference_answer"], "eval_answer": eval_answer, "gpt_prompt": content}, + "gpt_eval": {"pred": pred, "question_idx": doc["id"], "target": doc["reference_answer"], "eval_answer": eval_answer, "gpt_prompt": content}, + } + + +def mix_evals_image2text_aggregate_submissions(results, args, task): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"mix_evals_video2text_{task}-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + with open(path, "w") as f: + json.dump(results, f) + eval_logger.info(f"Submission file saved to {path}") + + +def mix_evals_image2text_gpt_eval(results, args): + score = 0 + for result in results: + eval_answer = result["eval_answer"] + eval_score = get_score_from_judge(eval_answer) + score += eval_score + + return score / len(results) + + +# Factory into different aggregate +def mix_evals_image2text_aggregate_gen(results, args): + mix_evals_image2text_aggregate_submissions(results, args, "OpenConvs") + + +class GPTMultiChoiceFilter(Filter): + def __init__(self, gpt_version: str = "gpt-3.5-turbo-0125", retries: int = 5): + """ + Can 
define custom behavior here, if an individual instantiation of a Filter class should have state. + """ + self.gpt_version = gpt_version + + if API_TYPE == "openai": + self.client = openai.OpenAI(api_key=API_KEY) + elif API_TYPE == "azure": + self.client = openai.AzureOpenAI(api_key=API_KEY, azure_endpoint=API_URL) + + self.retries = retries + + def apply(self, resps, docs): + """ + Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. + Should return the list of (filtered) response lists *in the same order as they were input*, e.g. + if pass in [, ] should return + [, ] + """ + results = [] + for response, doc in zip(resps, docs): + query = doc["query"] + options = "\n".join([f"{chr(ord('A') + idx)}. {option}" for idx, option in enumerate(doc["options"])]) + message = image2text_gpt_judge_for_closeended_multiplechoice(prompt=query, options=options, response=response) + payload = { + "model": self.gpt_version, + "messages": message, + "max_tokens": MAX_NEW_TOKENS, + } + result = 0 + for attempt in range(self.retries): + try: + # response = requests.post(API_URL, headers=headers, json=payload, timeout=60) + response = client.chat.completions.create(**payload) + # response.raise_for_status() + + # content =["choices"][0]["message"]["content"].strip() + content = response.choices[0].message.content.strip() + if content != "": + match = re.search(r"r'\b([A-Z])\.?\b'", content) + if match: + result = ord(match.group(1)) - ord("A") + else: + result = 0 + break # If successful, break out of the loop + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}") + import traceback + + print(traceback.format_exc()) + if attempt < self.retries: # If we have retries left, sleep and then continue to next attempt + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {self.retries} attempts failed. Last error message: {e}") + result = 0 + break + results.append(str(result)) + return results diff --git a/lmms_eval/tasks/mix_evals/video2text/_default_template_yaml b/lmms_eval/tasks/mix_evals/video2text/_default_template_yaml index 73473c72..4c84f1a2 100644 --- a/lmms_eval/tasks/mix_evals/video2text/_default_template_yaml +++ b/lmms_eval/tasks/mix_evals/video2text/_default_template_yaml @@ -12,5 +12,4 @@ lmms_eval_specific_kwargs: pre_prompt: These are frames from a video. Please answer the following questions about the video. metadata: gpt_eval_model_name: gpt-4o-mini - modality: video version: 0 diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml index e4800d60..30a50046 100644 --- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml +++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml @@ -19,7 +19,7 @@ include: _default_template_yaml lmms_eval_specific_kwargs: default: pre_prompt: "These are frames from a video. Please answer the following questions about the video." - post_prompt: "Answer the question using a single word or phrase." + post_prompt: "" gpt4v: pre_prompt: "These are frames from a video. Please answer the following questions about the video with a short phrase." 
post_prompt: "" diff --git a/lmms_eval/tasks/mix_evals/video2text/utils.py b/lmms_eval/tasks/mix_evals/video2text/utils.py index ad6e1f3d..70591b69 100644 --- a/lmms_eval/tasks/mix_evals/video2text/utils.py +++ b/lmms_eval/tasks/mix_evals/video2text/utils.py @@ -1,6 +1,8 @@ +import ast import datetime import json import os +import random import re import sys import time @@ -10,6 +12,7 @@ import requests import yaml from loguru import logger as eval_logger +from PIL import Image import lmms_eval.tasks._task_utils.file_utils as file_utils from lmms_eval.filters import Filter @@ -25,9 +28,8 @@ config = yaml.safe_load("".join(safe_data)) NUM_SECONDS_TO_SLEEP = 5 -GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] API_TYPE = os.getenv("API_TYPE", "openai") -API_VERSION = "gpt-3.5-turbo-0125" +MODEL_VERSION = "gpt-3.5-turbo-0125" MAX_NEW_TOKENS = 999 if API_TYPE == "openai": @@ -79,12 +81,31 @@ ] +def get_score_from_judge(judge_response): + """ + Get the score from the judge response. + """ + one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") + one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") + + match = re.search(one_score_pattern, judge_response) + if not match: + match = re.search(one_score_pattern_backup, judge_response) + + if match: + rating = ast.literal_eval(match.groups()[0]) + else: + rating = round(random.random(), 1) + + return float(rating) + + def get_eval(question, model_response: str, ground_truth: str, max_tokens: int, retries: int = 5): global client messages = video2text_gpt_judge_for_closeended_freeform(prompt=question, gold_ans=ground_truth, response=model_response) payload = { - "model": GPT_EVAL_MODEL_NAME, + "model": MODEL_VERSION, "messages": messages, # "temperature": 0.2, "max_tokens": max_tokens, @@ -97,10 +118,10 @@ def get_eval(question, model_response: str, ground_truth: str, max_tokens: int, # response.raise_for_status() response_data = response.json() - content = response_data["choices"][0]["message"]["content"].strip() - # content = response.choices[0].message.content.strip() + # content = response_data["choices"][0]["message"]["content"].strip() + content = response.choices[0].message.content.strip() if content != "": - return content, response["model"] + return content break # If successful, break out of the loop except Exception as e: @@ -109,8 +130,8 @@ def get_eval(question, model_response: str, ground_truth: str, max_tokens: int, time.sleep(NUM_SECONDS_TO_SLEEP) else: # If this was the last attempt, log and return empty eval_logger.error(f"All {retries} attempts failed. 
Last error message: {e}") - return "", "" - return "", "" + return "[[0.0]]" + return "[[0.0]]" # A bit ugly here @@ -137,7 +158,7 @@ def mix_evals_doc_to_visual(doc, modality): if modality == "video": visual.append(video_path) elif modality == "image": - visual.append(video_path) + visual.append(Image.open(video_path).convert("RGB")) else: sys.exit(f"modality:{modality} is not supported, please check") return visual @@ -242,8 +263,8 @@ def mix_evals_video2text_process_results_freeform(doc, result): pred = result[0] ground_truth_str = ", ".join([f'"{gt}"' for gt in doc["reference_answer"]]) ground_truth_str = f"[{ground_truth_str}]" - content = eval_prompt.format(model_response=pred, ground_truth=ground_truth_str) - eval_answer, model_name = get_eval(model_response=pred, ground_truth=ground_truth_str, max_tokens=MAX_NEW_TOKENS, question=doc["query"]) + content = video2text_gpt_judge_for_closeended_freeform(response=pred, gold_ans=ground_truth_str, prompt=doc["query"]) + eval_answer = get_eval(model_response=pred, ground_truth=ground_truth_str, max_tokens=MAX_NEW_TOKENS, question=doc["query"]) return { "submission": {"pred": pred, "question_idx": doc["id"], "target": doc["reference_answer"], "eval_answer": eval_answer, "gpt_prompt": content}, "gpt_eval": {"pred": pred, "question_idx": doc["id"], "target": doc["reference_answer"], "eval_answer": eval_answer, "gpt_prompt": content}, @@ -263,12 +284,7 @@ def mix_evals_video2text_gpt_eval(results, args): score = 0 for result in results: eval_answer = result["eval_answer"] - eval_score = re.search(r"([0-9.]+)", eval_answer).group(1) - try: - eval_score = float(eval_score) - except Exception as e: - eval_logger.error(f"Error parsing eval_score: {e}") - eval_score = 0.0 + eval_score = get_score_from_judge(eval_answer) score += eval_score return score / len(results) @@ -368,10 +384,9 @@ def apply(self, resps, docs): # response = requests.post(API_URL, headers=headers, json=payload, timeout=60) response = client.chat.completions.create(**payload) # response.raise_for_status() - response_data = response.json() - content = response_data["choices"][0]["message"]["content"].strip() - # content = response.choices[0].message.content.strip() + # content =["choices"][0]["message"]["content"].strip() + content = response.choices[0].message.content.strip() if content != "": match = re.search(r"r'\b([A-Z])\.?\b'", content) if match: @@ -382,11 +397,14 @@ def apply(self, resps, docs): except Exception as e: eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}") + import traceback + + print(traceback.format_exc()) if attempt < self.retries: # If we have retries left, sleep and then continue to next attempt time.sleep(NUM_SECONDS_TO_SLEEP) else: # If this was the last attempt, log and return empty eval_logger.error(f"All {self.retries} attempts failed. 
From 5e2341316ab1acff6782555e6c1ae5695c3a48e8 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Sun, 1 Dec 2024 21:07:25 +0800
Subject: [PATCH 05/10] Refactor image-to-text evaluation tasks and update
 configurations

---
 lmms_eval/tasks/__init__.py                             |  3 +++
 .../mix_evals/image2text/_default_template_yaml         |  1 +
 .../mix_evals/image2text/mix_evals_image2text.yaml      |  1 -
 .../image2text/mix_evals_image2text_openended.yaml      | 14 --------------
 lmms_eval/tasks/mix_evals/image2text/utils.py           |  7 ++++---
 5 files changed, 8 insertions(+), 18 deletions(-)
 delete mode 100644 lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_openended.yaml

diff --git a/lmms_eval/tasks/__init__.py b/lmms_eval/tasks/__init__.py
index 3749248e..85537d0c 100755
--- a/lmms_eval/tasks/__init__.py
+++ b/lmms_eval/tasks/__init__.py
@@ -417,6 +417,8 @@ def _get_task_and_group(self, task_dir: str):
                     "yaml_path": yaml_path,
                 }
             elif self._config_is_group(config):
+                if f.endswith("mix_evals_image2text.yaml"):
+                    print(config)
                 # This is a group config
                 tasks_and_groups[config["group"]] = {
                     "type": "group",
@@ -477,6 +479,7 @@ def _get_task_and_group(self, task_dir: str):
             else:
                 self.logger.debug(f"File {f} in {root} could not be loaded as a task or group")
 
+        print(tasks_and_groups["mix_evals_image2text"])
         return tasks_and_groups
 
diff --git a/lmms_eval/tasks/mix_evals/image2text/_default_template_yaml b/lmms_eval/tasks/mix_evals/image2text/_default_template_yaml
index c75156f4..ee3858f9 100644
--- a/lmms_eval/tasks/mix_evals/image2text/_default_template_yaml
+++ b/lmms_eval/tasks/mix_evals/image2text/_default_template_yaml
@@ -1,5 +1,6 @@
 dataset_path: MixEval/MixEval-X
 dataset_kwargs:
+  video: true # a bit confusing, but this is because the official uses path to store image data, so we need to load it as a video dataset
   cache_dir: mix_evals_image2text
 lmms_eval_specific_kwargs:
   default:
diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml
index 053e13df..141c8c56 100644
--- a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml
+++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml
@@ -2,4 +2,3 @@ group: mix_evals_image2text
 task:
 - mix_evals_image2text_mc
 - mix_evals_image2text_freeform
-# - mix_evals_video2text_openended
\ No newline at end of file
diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_openended.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_openended.yaml
deleted file mode 100644
index 5e05aea5..00000000
--- a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_openended.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-include: _default_template_yaml
-dataset_name: "open_ended"
-task: "mix_evals_image2text_openended"
-test_split: image2text
-output_type: generate_until
-doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
-doc_to_text: !function utils.mix_evals_image2text_doc_to_text_open_convs
-doc_to_target: ""
-process_results: !function utils.mix_evals_video2text_process_results_open_convs
-
-metric_list:
-  - metric: submission
-    aggregation: !function utils.mix_evals_video2text_aggregate_gen
-    higher_is_better: true
diff --git a/lmms_eval/tasks/mix_evals/image2text/utils.py b/lmms_eval/tasks/mix_evals/image2text/utils.py
index 32333044..ea1b306d 100644
--- a/lmms_eval/tasks/mix_evals/image2text/utils.py
+++ b/lmms_eval/tasks/mix_evals/image2text/utils.py
@@ -304,7 +304,7 @@ def mix_evals_image2text_process_results_freeform(doc, result):
 
 def mix_evals_image2text_aggregate_submissions(results, args, task):
     now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
-    submission_file_name = f"mix_evals_video2text_{task}-{now_date_time}.json"
+    submission_file_name = f"mix_evals_image2text_{task}-{now_date_time}.json"
    path = file_utils.generate_submission_file(submission_file_name, args)
     with open(path, "w") as f:
         json.dump(results, f)
@@ -365,8 +365,9 @@ def apply(self, resps, docs):
                 # response.raise_for_status()
 
                 # content =["choices"][0]["message"]["content"].strip()
-                content = response.choices[0].message.content.strip()
-                if content != "":
+                content = response.choices[0].message.content
+                if content:
+                    content = content.strip()
                     match = re.search(r"r'\b([A-Z])\.?\b'", content)
                     if match:
                         result = ord(match.group(1)) - ord("A")
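
Note on the `if content:` guard above: with the `openai` client object, `message.content` is typed as optional and can be None (for example when the model returns a tool call), so truthiness is checked before calling `.strip()`. For illustration, a standalone sketch of the guarded letter parsing; the diff's pattern string `r"r'\b([A-Z])\.?\b'"` carries stray `r'...'` characters, apparently from a copy-paste, so the sketch uses the evidently intended pattern `\b([A-Z])\.?\b` instead (helper name and sample strings invented):

    import re
    from typing import Optional

    def parse_choice_letter(content: Optional[str]) -> int:
        # message.content may be None, so guard before stripping, as in the hunk above.
        if content:
            content = content.strip()
            match = re.search(r"\b([A-Z])\.?\b", content)  # a standalone capital letter
            if match:
                return ord(match.group(1)) - ord("A")  # "A" -> 0, "B" -> 1, ...
        return 0  # default to the first option when nothing matches

    print(parse_choice_letter("The correct option is B."))  # 1
    print(parse_choice_letter(None))  # 0
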
From 4fddffff853957fd3006dc50e679ccb6542bced7 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Sun, 1 Dec 2024 21:17:34 +0800
Subject: [PATCH 06/10] Enhance LlamaVision model with video loading
 improvements and configuration updates

---
 lmms_eval/models/llama_vision.py              | 31 +++++++------------
 lmms_eval/tasks/mix_evals/video2text/utils.py |  5 +--
 2 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/lmms_eval/models/llama_vision.py b/lmms_eval/models/llama_vision.py
index 2051dd2c..c67f7965 100644
--- a/lmms_eval/models/llama_vision.py
+++ b/lmms_eval/models/llama_vision.py
@@ -15,6 +15,7 @@
 from lmms_eval.api.instance import Instance
 from lmms_eval.api.model import lmms
 from lmms_eval.api.registry import register_model
+from lmms_eval.models.model_utils.load_video import read_video_pyav_pil
 
 warnings.filterwarnings("ignore")
 
@@ -25,22 +26,6 @@
 
 @register_model("llama_vision")
 class LlamaVision(lmms):
-    """
-    Llava Model for Hugging Face Transformers: https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/llava
-
-    Adapted from the InstructBLIP model in lmms_eval/models/instructblip.py
-
-    Example usage:
-
-    accelerate launch --num_processes=8 --main_process_port 12345 -m lmms_eval \
-        --model llava_hf \
-        --model_args pretrained=llava-hf/llava-1.5-7b-hf \
-        --tasks seedbench \
-        --batch_size 1 \
-        --output_path ./logs/ \
-        --log_samples
-    """
-
     def __init__(
         self,
         pretrained: str = "meta-llama/Llama-3.2-11B-Vision",
@@ -48,10 +33,12 @@ def __init__(
         device: str = "cuda",
         dtype: Optional[Union[str, torch.dtype]] = "auto",
         batch_size: int = 1,
-        trust_remote_code: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = True,
         attn_implementation: Optional[str] = None,
         device_map: str = "",
         max_frames_num: Optional[int] = 32,
+        fps: Optional[int] = None,
+        max_image_size: Optional[int] = None,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -68,7 +55,9 @@ def __init__(
         if isinstance(dtype, str) and dtype != "auto":
             dtype = getattr(torch, dtype)
 
+        self.fps = fps
         self.max_frames_num = max_frames_num
+        self.max_image_size = max_image_size
         self._model = MllamaForConditionalGeneration.from_pretrained(pretrained, revision=revision, torch_dtype=dtype, device_map=self.device_map, trust_remote_code=trust_remote_code, attn_implementation=attn_implementation)
         self.model.eval()
         self.processor = AutoProcessor.from_pretrained(pretrained)
@@ -193,9 +182,11 @@ def generate_until(self, requests: List[Instance]) -> List[str]:
 
             for visual in visuals:
                 if isinstance(visual, str):
-                    frames = self.load_video(visual, self.max_frames_num)
-                    frames = torch.from_numpy(frames).permute(0, 3, 1, 2)
-                    images.extend([to_pil_image(frame) for frame in frames])
+                    frames = read_video_pyav_pil(visual, num_frm=self.max_frames_num, fps=self.fps, max_image_size=self.max_image_size)
+                    images.extend(frames)
+                    # frames = self.load_video(visual, self.max_frames_num)
+                    # frames = torch.from_numpy(frames).permute(0, 3, 1, 2)
+                    # images.extend([to_pil_image(frame) for frame in frames])
                 elif isinstance(visual, PIL.Image.Image):
                     images.append(visual)
 
diff --git a/lmms_eval/tasks/mix_evals/video2text/utils.py b/lmms_eval/tasks/mix_evals/video2text/utils.py
index 70591b69..4f7eaf8b 100644
--- a/lmms_eval/tasks/mix_evals/video2text/utils.py
+++ b/lmms_eval/tasks/mix_evals/video2text/utils.py
@@ -386,8 +386,9 @@ def apply(self, resps, docs):
                 # response.raise_for_status()
 
                 # content =["choices"][0]["message"]["content"].strip()
-                content = response.choices[0].message.content.strip()
-                if content != "":
+                content = response.choices[0].message.content
+                if content:
+                    content = content.strip()
                     match = re.search(r"r'\b([A-Z])\.?\b'", content)
                     if match:
                         result = ord(match.group(1)) - ord("A")
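
Note on the frame-loading change above: `read_video_pyav_pil` already returns PIL frames, which removes the numpy-to-tensor-to-PIL round trip. A sketch of the calling pattern the patch adopts; the helper's keyword arguments are taken from the import and call site in the diff, its sampling internals (in `lmms_eval/models/model_utils/load_video.py`) are assumed rather than shown, and `collect_images` is an invented wrapper name:

    from typing import List, Optional

    import PIL.Image
    from lmms_eval.models.model_utils.load_video import read_video_pyav_pil

    def collect_images(visuals, max_frames_num: int = 32, fps: Optional[int] = None, max_image_size: Optional[int] = None) -> List[PIL.Image.Image]:
        """Flatten mixed visuals (video paths and PIL images) into one PIL list,
        following the generate_until loop in the hunk above."""
        images: List[PIL.Image.Image] = []
        for visual in visuals:
            if isinstance(visual, str):  # a video path: decode and sample frames
                images.extend(read_video_pyav_pil(visual, num_frm=max_frames_num, fps=fps, max_image_size=max_image_size))
            elif isinstance(visual, PIL.Image.Image):  # already a decoded image
                images.append(visual)
        return images

Capping `max_image_size` presumably bounds the resolution of each sampled frame, which matters once up to `max_frames_num` frames are packed into a single prompt.
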
From b4ccbe157c0e06b68e1f1fa1efd376dedd7fe0ca Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Sun, 1 Dec 2024 23:37:47 +0800
Subject: [PATCH 07/10] fix internvl2

---
 lmms_eval/models/internvl2.py                 |  4 ++--
 lmms_eval/tasks/mix_evals/image2text/utils.py | 15 +++++++++++++--
 lmms_eval/tasks/mix_evals/video2text/utils.py | 15 +++++++++++++--
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/lmms_eval/models/internvl2.py b/lmms_eval/models/internvl2.py
index 5de4ce1f..ae4cc0c8 100644
--- a/lmms_eval/models/internvl2.py
+++ b/lmms_eval/models/internvl2.py
@@ -139,8 +139,8 @@ def __init__(
         super().__init__()
 
         self.path = pretrained
-        self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True).eval().cuda()
-        self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True)
+        self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True, device_map=device_map).eval()
+        self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True, device_map=device_map)
 
         batch_size = int(batch_size)
         assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}."
diff --git a/lmms_eval/tasks/mix_evals/image2text/utils.py b/lmms_eval/tasks/mix_evals/image2text/utils.py
index ea1b306d..50b181ae 100644
--- a/lmms_eval/tasks/mix_evals/image2text/utils.py
+++ b/lmms_eval/tasks/mix_evals/image2text/utils.py
@@ -361,16 +361,27 @@ def apply(self, resps, docs):
         for attempt in range(self.retries):
             try:
                 # response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+                # print(payload)
                 response = client.chat.completions.create(**payload)
+                # print(response)
                 # response.raise_for_status()
 
                 # content =["choices"][0]["message"]["content"].strip()
                 content = response.choices[0].message.content
+                # print("content:", content)
                 if content:
-                    content = content.strip()
-                    match = re.search(r"r'\b([A-Z])\.?\b'", content)
+                    match = re.search(r"\[\[([A-Z])\]\]", content)
+                    # print("match:", match)
+                    if not match:
+                        match = re.search(r"r'\b([A-Z])\.?\b'", content)
+                        # print("match:", match)
                     if match:
+                        # print("=====")
+                        # print(match.group(1))
                         result = ord(match.group(1)) - ord("A")
+                        # print("result:", result)
+                        # print("=====")
+                        # print(content, result)
                     else:
                         result = 0
                 break  # If successful, break out of the loop
diff --git a/lmms_eval/tasks/mix_evals/video2text/utils.py b/lmms_eval/tasks/mix_evals/video2text/utils.py
index 4f7eaf8b..d98f9a81 100644
--- a/lmms_eval/tasks/mix_evals/video2text/utils.py
+++ b/lmms_eval/tasks/mix_evals/video2text/utils.py
@@ -382,16 +382,27 @@ def apply(self, resps, docs):
         for attempt in range(self.retries):
             try:
                 # response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+                # print(payload)
                 response = client.chat.completions.create(**payload)
+                # print(response)
                 # response.raise_for_status()
 
                 # content =["choices"][0]["message"]["content"].strip()
                 content = response.choices[0].message.content
+                print("content:", content)
                 if content:
-                    content = content.strip()
-                    match = re.search(r"r'\b([A-Z])\.?\b'", content)
+                    match = re.search(r"\[\[([A-Z])\]\]", content)
+                    # print("match:", match)
+                    if not match:
+                        match = re.search(r"r'\b([A-Z])\.?\b'", content)
+                        # print("match:", match)
                     if match:
+                        # print("=====")
+                        # print(match.group(1))
                         result = ord(match.group(1)) - ord("A")
+                        # print("result:", result)
+                        # print("=====")
+                        # print(content, result)
                     else:
                         result = 0
                 break  # If successful, break out of the loop
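
Note on the two parsing hunks above: the judge prompt evidently asks for multiple-choice verdicts in a `[[B]]` format, which the patch now tries first, keeping the old bare-letter heuristic only as a fallback. A standalone sketch of that precedence; as before, the fallback is written as the intended pattern rather than the diff's stray-`r'` version, and the function name and sample strings are invented:

    import re

    def parse_mc_answer(content: str) -> int:
        # Prefer the bracketed judge format "[[B]]", as in the hunks above ...
        match = re.search(r"\[\[([A-Z])\]\]", content)
        if not match:
            # ... then fall back to a bare capital letter, e.g. "Answer: B."
            match = re.search(r"\b([A-Z])\.?\b", content)
        return ord(match.group(1)) - ord("A") if match else 0

    print(parse_mc_answer("The best supported option is [[C]]"))  # 2
    print(parse_mc_answer("Answer: B."))  # 1
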
From aa5c5bf45dd1e6259dc46fc20a35e695308a2bb6 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Mon, 2 Dec 2024 00:51:54 +0800
Subject: [PATCH 08/10] llava vid default numframe too small

---
 lmms_eval/models/llava_vid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmms_eval/models/llava_vid.py b/lmms_eval/models/llava_vid.py
index fd3e9ae1..6c430325 100755
--- a/lmms_eval/models/llava_vid.py
+++ b/lmms_eval/models/llava_vid.py
@@ -90,7 +90,7 @@ def __init__(
         conv_template="vicuna_v1",
         use_cache=True,
         truncate_context=False,  # whether to truncate the context in generation, set it False for LLaVA-1.6
-        max_frames_num: int = 3,
+        max_frames_num: int = 20,
         video_fps: int = 1,
         mm_resampler_type: str = "spatial_pool",
         mm_spatial_pool_stride: int = 2,

From 11540d1d6078d5eed87d8f28eecae753f4ab8602 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Mon, 2 Dec 2024 13:06:16 +0800
Subject: [PATCH 09/10] add max_new_tokens

---
 .../mix_evals/image2text/mix_evals_image2text_freeform.yaml      | 2 +-
 .../image2text/mix_evals_image2text_freeform_hard.yaml           | 2 +-
 .../tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml      | 2 +-
 .../mix_evals/image2text/mix_evals_image2text_mc_hard.yaml       | 2 +-
 .../mix_evals/video2text/mix_evals_video2text_freeform.yaml      | 2 +-
 .../video2text/mix_evals_video2text_freeform_hard.yaml           | 2 +-
 .../tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml      | 2 +-
 .../mix_evals/video2text/mix_evals_video2text_mc_hard.yaml       | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml
index 1a21c46f..e1e7cded 100644
--- a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml
+++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml
@@ -12,6 +12,6 @@ metric_list:
     higher_is_better: true
 
 generation_kwargs:
-  max_new_tokens: 16
+  max_new_tokens: 1024
 
 include: _default_template_yaml
diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml
index e0aeea52..24874364 100644
--- a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml
+++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml
@@ -12,7 +12,7 @@ metric_list:
     higher_is_better: true
 
 generation_kwargs:
-  max_new_tokens: 16
+  max_new_tokens: 1024
 
 include: _default_template_yaml
 
diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml
index 4ac669b5..1100b539 100644
--- a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml
+++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml
@@ -8,7 +8,7 @@ doc_to_text: !function utils.mix_evals_image2text_doc_to_text
 doc_to_target: "{{reference_answer}}"
 
 generation_kwargs:
-  max_new_tokens: 5
+  max_new_tokens: 1024
 
 metric_list:
   - metric: exact_match
diff --git a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml
index 58fae82a..8fd90184 100644
--- a/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml
+++ b/lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml
@@ -8,7 +8,7 @@ doc_to_text: !function utils.mix_evals_image2text_doc_to_text
 doc_to_target: "{{reference_answer}}"
 
 generation_kwargs:
-  max_new_tokens: 5
+  max_new_tokens: 1024
 
 metric_list:
   - metric: exact_match
diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
index 30a50046..366a2bea 100644
--- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
@@ -12,7 +12,7 @@ metric_list:
     higher_is_better: true
 
 generation_kwargs:
-  max_new_tokens: 16
+  max_new_tokens: 1024
 
 include: _default_template_yaml
 
diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform_hard.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform_hard.yaml
index 37690431..059d2b28 100644
--- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform_hard.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform_hard.yaml
@@ -12,7 +12,7 @@ metric_list:
     higher_is_better: true
 
 generation_kwargs:
-  max_new_tokens: 16
+  max_new_tokens: 1024
 
 include: _default_template_yaml
 
diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml
index 1e36fab9..c94a0c5a 100644
--- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml
@@ -8,7 +8,7 @@ doc_to_text: !function utils.mix_evals_video2text_doc_to_text
 doc_to_target: "{{reference_answer}}"
 
 generation_kwargs:
-  max_new_tokens: 5
+  max_new_tokens: 1024
 
 metric_list:
   - metric: exact_match
diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc_hard.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc_hard.yaml
index 97754a67..9cc3f2da 100644
--- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc_hard.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc_hard.yaml
@@ -8,7 +8,7 @@ doc_to_text: !function utils.mix_evals_video2text_doc_to_text
 doc_to_target: "{{reference_answer}}"
 
 generation_kwargs:
-  max_new_tokens: 5
+  max_new_tokens: 1024
 
 metric_list:
   - metric: exact_match

From 57f8c43ea1f69182313385ec837588c637442ee0 Mon Sep 17 00:00:00 2001
From: Pu Fanyi
Date: Mon, 2 Dec 2024 15:11:57 +0800
Subject: [PATCH 10/10] remove "with a short phrase" for `gpt4v`

---
 .../mix_evals/video2text/mix_evals_video2text_freeform.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
index 366a2bea..35531b0f 100644
--- a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
+++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml
@@ -21,5 +21,5 @@ lmms_eval_specific_kwargs:
     pre_prompt: "These are frames from a video. Please answer the following questions about the video."
     post_prompt: ""
   gpt4v:
-    pre_prompt: "These are frames from a video. Please answer the following questions about the video with a short phrase."
+    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
     post_prompt: ""