From 4a7c8f0166710ba37b588bd44115d8dbc8b7c699 Mon Sep 17 00:00:00 2001
From: KairuiHu
Date: Thu, 19 Sep 2024 11:33:29 +0800
Subject: [PATCH] add videosearch and mmmu_for_testing tasks

---
 check_missing.py | 20 +
 check_reverse.py | 33 ++
 lmms_eval/models/gpt4o.py | 238 ++++
 lmms_eval/models/gpt4v.py | 23 +-
 lmms_eval/tasks/mmmu/mmmu_val.yaml | 3 +
 .../mmmu_for_testing/_default_template_yaml | 6 +
 lmms_eval/tasks/mmmu_for_testing/mmmu.yaml | 3 +
 .../mmmu_for_testing/mmmu_for_testing.yaml | 17 +
 .../testing_combined_milestone.yaml | 17 +
 lmms_eval/tasks/mmmu_for_testing/utils.py | 549 ++++++++++++++++++
 .../tasks/mmmu_pro/mmmu_pro_original.yaml | 2 +-
 .../tasks/videosearch/_default_template_yaml | 12 +
 .../tasks/videosearch/combined_milestone.yaml | 16 +
 lmms_eval/tasks/videosearch/utils.py | 511 ++++
 lmms_eval/tasks/videosearch/videosearch.yaml | 16 +
 15 files changed, 1458 insertions(+), 8 deletions(-)
 create mode 100644 check_missing.py
 create mode 100644 check_reverse.py
 create mode 100755 lmms_eval/models/gpt4o.py
 create mode 100644 lmms_eval/tasks/mmmu_for_testing/_default_template_yaml
 create mode 100755 lmms_eval/tasks/mmmu_for_testing/mmmu.yaml
 create mode 100755 lmms_eval/tasks/mmmu_for_testing/mmmu_for_testing.yaml
 create mode 100755 lmms_eval/tasks/mmmu_for_testing/testing_combined_milestone.yaml
 create mode 100755 lmms_eval/tasks/mmmu_for_testing/utils.py
 create mode 100644 lmms_eval/tasks/videosearch/_default_template_yaml
 create mode 100755 lmms_eval/tasks/videosearch/combined_milestone.yaml
 create mode 100755 lmms_eval/tasks/videosearch/utils.py
 create mode 100755 lmms_eval/tasks/videosearch/videosearch.yaml

diff --git a/check_missing.py b/check_missing.py
new file mode 100644
index 00000000..c4462f8f
--- /dev/null
+++ b/check_missing.py
@@ -0,0 +1,20 @@
+from datasets import load_dataset, Dataset
+
+# Load the deduplicated VideoSearch dataset
+videosearch_dataset = load_dataset('lmms-lab/VideoSearch', 'deduplicated_combined_milestone', split='test')
+
+# ID to be removed
+id_to_remove = 'validation_Biology_18'
+
+# Filter out the row with the missing ID
+filtered_rows = [row for row in videosearch_dataset if row['id'] != id_to_remove]
+
+# Create a new dataset from the filtered rows
+filtered_dataset = Dataset.from_list(filtered_rows)
+
+# Push the filtered dataset to the Hugging Face Hub
+filtered_dataset.push_to_hub("lmms-lab/VideoSearch", "final_combined_milestone", split="test")
+
+# Check and print the number of rows before and after filtering
+print(f"Original dataset size: {len(videosearch_dataset)}")
+print(f"Filtered dataset size: {len(filtered_dataset)}")
diff --git a/check_reverse.py b/check_reverse.py
new file mode 100644
index 00000000..32ff6f27
--- /dev/null
+++ b/check_reverse.py
@@ -0,0 +1,33 @@
+import os
+from datasets import load_dataset
+
+# Load the VideoSearch dataset
+videosearch_dataset = load_dataset('lmms-lab/VideoSearch', 'final_combined_milestone', split='test')
+
+# Path to the videos directory (replace with your actual path)
+videos_directory = '/mnt/sfs-common/krhu/.cache/huggingface/Combined_milestone/videos/'
+
+# Get all IDs from the dataset
+videosearch_ids = set(videosearch_dataset['id'])
+
+# List to store IDs of files that are not in the dataset
+extra_files = []
+
+# Loop through all .mp4 files in the videos directory
+for file in os.listdir(videos_directory):
+    if file.endswith('.mp4'):
+        # Extract the ID from the file name (remove the .mp4 extension)
+        file_id = file.replace('.mp4', '')
+
+        # Check if the 
file ID exists in the VideoSearch dataset + if file_id not in videosearch_ids: + extra_files.append(file_id) + +# Print the IDs of .mp4 files that are not in the dataset +if extra_files: + print(f"MP4 files not included in the VideoSearch dataset: {extra_files}") +else: + print("All MP4 files have corresponding entries in the VideoSearch dataset.") + +# Optionally, print the total number of extra files +print(f"Total extra MP4 files: {len(extra_files)}") diff --git a/lmms_eval/models/gpt4o.py b/lmms_eval/models/gpt4o.py new file mode 100755 index 00000000..d312e588 --- /dev/null +++ b/lmms_eval/models/gpt4o.py @@ -0,0 +1,238 @@ +import base64 +import json +import os +import time +from copy import deepcopy +from io import BytesIO +from typing import List, Tuple + +import numpy as np +import requests as url_requests +from accelerate import Accelerator, DistributedType +from tqdm import tqdm + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +try: + from decord import VideoReader, cpu +except ImportError: + pass + +from PIL import Image + +# API_TYPE = os.getenv("API_TYPE", "openai") +API_TYPE = "azure" +NUM_SECONDS_TO_SLEEP = 30 +from loguru import logger as eval_logger + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + # API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_URL = "https://egoasr.openai.azure.com/openai/deployments/egodataset/chat/completions?api-version=2024-02-15-preview" + # API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + API_KEY = "5b496f4100d14ce58186e1bfe91db644" + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +@register_model("gpt4o") +class GPT4O(lmms): + def __init__( + self, + # model_version: str = "gpt-4-vision-preview", + modality: str = "video", + max_frames_num: int = 10, + timeout: int = 120, + continual_mode: bool = False, + response_persistent_folder: str = None, + **kwargs, + ) -> None: + super().__init__() + # Manually set a image token for GPT4V so that we can search for it + # and split the text and image + # Here we just use the same token as llava for convenient + # self.model_version = model_version + self.modality = modality + self.max_frames_num = max_frames_num + self.image_token = "" + self.timeout = timeout + self.continual_mode = continual_mode + if self.continual_mode: + if response_persistent_folder is None: + raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.") + + os.makedirs(response_persistent_folder, exist_ok=True) + self.response_persistent_folder = response_persistent_folder + self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json") + + if os.path.exists(self.response_persistent_file): + with open(self.response_persistent_file, "r") as f: + self.response_cache = json.load(f) + self.cache_mode = "resume" + else: + self.response_cache = {} + self.cache_mode = "start" + + accelerator = Accelerator() + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." 
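+        # Record the process rank and world size from Accelerate so that requests
+        # can be sharded across processes when running data-parallel evaluation.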
+ if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + # Function to encode the image + def encode_image(self, image: Image): + output_buffer = BytesIO() + image.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + # Function to encode the video + def encode_video(self, video_path, for_get_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int) + + # Ensure the last frame is included + if total_frame_num - 1 not in uniform_sampled_frames: + uniform_sampled_frames = np.append(uniform_sampled_frames, total_frame_num - 1) + + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + img = Image.fromarray(frame) + output_buffer = BytesIO() + img.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + base64_frames.append(base64_str) + + return base64_frames + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + if self.continual_mode is True and self.cache_mode == "resume": + doc_uuid = f"{task}___{split}___{doc_id}" + if doc_uuid in self.response_cache: + response_text = self.response_cache[doc_uuid] + if response_text: + res.append(response_text) + pbar.update(1) + continue + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + imgs = [] # multiple images or frames for video + for visual in visuals: + if self.modality == "image": + img = self.encode_image(visual) + imgs.append(img) + elif self.modality == "video": + frames = self.encode_video(visual, self.max_frames_num) + imgs.extend(frames) + + payload = {"messages": []} + if API_TYPE == "openai": + payload["model"] = self.model_version + + response_json = {"role": "user", "content": []} + # When there is no image token in the context, append the image to the text + if self.image_token not in contexts: + payload["messages"].append(deepcopy(response_json)) + payload["messages"][0]["content"].append({"type": "text", "text": contexts}) + for img in imgs: + payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) + else: + contexts = contexts.split(self.image_token) + for idx, img in enumerate(imgs): + payload["messages"].append(deepcopy(response_json)) + payload["messages"][idx]["content"].append({"type": "text", "text": 
contexts[idx]}) + payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) + + # If n image tokens are in the contexts + # contexts will be splitted into n+1 chunks + # Manually add it into the payload + payload["messages"].append(deepcopy(response_json)) + payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if gen_kwargs["max_new_tokens"] > 4096: + gen_kwargs["max_new_tokens"] = 4096 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + payload["max_tokens"] = gen_kwargs["max_new_tokens"] + payload["temperature"] = gen_kwargs["temperature"] + + for attempt in range(5): + try: + response = url_requests.post(API_URL, headers=headers, json=payload, timeout=self.timeout) + response_data = response.json() + + response_text = response_data["choices"][0]["message"]["content"].strip() + break # If successful, break out of the loop + + except Exception as e: + try: + error_msg = response.json() + except: + error_msg = "" + + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}.\nReponse: {error_msg}") + if attempt <= 5: + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty string + eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}.\nResponse: {response.json()}") + response_text = "" + res.append(response_text) + pbar.update(1) + + if self.continual_mode is True: # Cache the response + doc_uuid = f"{task}___{split}___{doc_id}" + self.response_cache[doc_uuid] = response_text + with open(self.response_persistent_file, "w") as f: + json.dump(self.response_cache, f) + + pbar.close() + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "GPT4V not support" diff --git a/lmms_eval/models/gpt4v.py b/lmms_eval/models/gpt4v.py index aac62a87..2301f7b8 100755 --- a/lmms_eval/models/gpt4v.py +++ b/lmms_eval/models/gpt4v.py @@ -23,6 +23,7 @@ from PIL import Image API_TYPE = os.getenv("API_TYPE", "openai") +# API_TYPE = "azure" NUM_SECONDS_TO_SLEEP = 30 from loguru import logger as eval_logger @@ -35,7 +36,8 @@ } elif API_TYPE == "azure": API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") - API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + # API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + API_KEY = "5b496f4100d14ce58186e1bfe91db644" headers = { "api-key": API_KEY, "Content-Type": "application/json", @@ -46,9 +48,9 @@ class GPT4V(lmms): def __init__( self, - model_version: str = "gpt-4-vision-preview", + #model_version: str = "gpt-4-vision-preview", modality: str = "video", - max_frames_num: int = 10, + max_frames_num: int = 32, timeout: int = 120, continual_mode: bool = False, response_persistent_folder: str = None, @@ -58,7 +60,7 @@ def __init__( # Manually set a image token for GPT4V so that we can search for it # and split the text and image # Here we just use the same token as llava for convenient - self.model_version = model_version + #self.model_version = model_version self.modality = modality self.max_frames_num = max_frames_num self.image_token = "" @@ -157,8 +159,15 @@ def generate_until(self, requests) -> List[str]: img = self.encode_image(visual) imgs.append(img) elif self.modality 
== "video": - frames = self.encode_video(visual, self.max_frames_num) - imgs.extend(frames) + # frames = self.encode_video(visual, self.max_frames_num) + # imgs.extend(frames) + try: + frames = self.encode_video(visual, self.max_frames_num) + imgs.extend(frames) + except Exception as e: + # Log the error and skip to the next visual + eval_logger.error(f"Error {e} in encoding video for {visual}") + continue # Skip this visual and continue with the others payload = {"messages": []} if API_TYPE == "openai": @@ -185,7 +194,7 @@ def generate_until(self, requests) -> List[str]: payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) if "max_new_tokens" not in gen_kwargs: - gen_kwargs["max_new_tokens"] = 1024 + gen_kwargs["max_new_tokens"] = 4096 if gen_kwargs["max_new_tokens"] > 4096: gen_kwargs["max_new_tokens"] = 4096 if "temperature" not in gen_kwargs: diff --git a/lmms_eval/tasks/mmmu/mmmu_val.yaml b/lmms_eval/tasks/mmmu/mmmu_val.yaml index a301f7cb..e6879e33 100755 --- a/lmms_eval/tasks/mmmu/mmmu_val.yaml +++ b/lmms_eval/tasks/mmmu/mmmu_val.yaml @@ -12,5 +12,8 @@ metric_list: - metric: mmmu_acc aggregation: !function utils.mmmu_aggregate_results higher_is_better: true + - metric: submission + aggregation: !function utils.mmmu_test_aggregate_results_for_submission + higher_is_better: true include: _default_template_yaml \ No newline at end of file diff --git a/lmms_eval/tasks/mmmu_for_testing/_default_template_yaml b/lmms_eval/tasks/mmmu_for_testing/_default_template_yaml new file mode 100644 index 00000000..a5367534 --- /dev/null +++ b/lmms_eval/tasks/mmmu_for_testing/_default_template_yaml @@ -0,0 +1,6 @@ +generation_kwargs: + max_new_tokens: 16 + +metadata: + version: 0.0 + interleaved_format: false \ No newline at end of file diff --git a/lmms_eval/tasks/mmmu_for_testing/mmmu.yaml b/lmms_eval/tasks/mmmu_for_testing/mmmu.yaml new file mode 100755 index 00000000..28a9d000 --- /dev/null +++ b/lmms_eval/tasks/mmmu_for_testing/mmmu.yaml @@ -0,0 +1,3 @@ +group: mmmu +task: +- mmmu_testing_val \ No newline at end of file diff --git a/lmms_eval/tasks/mmmu_for_testing/mmmu_for_testing.yaml b/lmms_eval/tasks/mmmu_for_testing/mmmu_for_testing.yaml new file mode 100755 index 00000000..7c39e737 --- /dev/null +++ b/lmms_eval/tasks/mmmu_for_testing/mmmu_for_testing.yaml @@ -0,0 +1,17 @@ +dataset_path: lmms-lab/MMMU_for_testing +dataset_name: "updated_first_milestone" +task: "mmmu_testing_val" +test_split: train +output_type: generate_until +doc_to_visual: !function utils.mmmu_doc_to_visual +doc_to_text: !function utils.mmmu_doc_to_text +doc_to_target: "answer" +# The return value of process_results will be used by metrics +process_results: !function utils.mmmu_process_results + +metric_list: + - metric: mmmu_acc + aggregation: !function utils.mmmu_aggregate_results + higher_is_better: true + +include: _default_template_yaml \ No newline at end of file diff --git a/lmms_eval/tasks/mmmu_for_testing/testing_combined_milestone.yaml b/lmms_eval/tasks/mmmu_for_testing/testing_combined_milestone.yaml new file mode 100755 index 00000000..90921fbc --- /dev/null +++ b/lmms_eval/tasks/mmmu_for_testing/testing_combined_milestone.yaml @@ -0,0 +1,17 @@ +dataset_path: lmms-lab/MMMU_for_testing +dataset_name: "combined_milestone" +task: "mmmu_testing_combined_milestone" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.mmmu_doc_to_visual +doc_to_text: !function utils.mmmu_doc_to_text +doc_to_target: "answer" +# The return value of process_results will be used by 
metrics
+process_results: !function utils.mmmu_process_results
+
+metric_list:
+  - metric: mmmu_acc
+    aggregation: !function utils.mmmu_aggregate_results
+    higher_is_better: true
+
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmmu_for_testing/utils.py b/lmms_eval/tasks/mmmu_for_testing/utils.py
new file mode 100755
index 00000000..f685bd9a
--- /dev/null
+++ b/lmms_eval/tasks/mmmu_for_testing/utils.py
@@ -0,0 +1,549 @@
+from collections import defaultdict
+import re
+import ast
+import random
+import numpy as np
+import os
+import json
+from pathlib import Path
+import yaml
+
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+
+from loguru import logger as eval_logger
+
+# MULTI_CHOICE_PROMPT = "Answer with the option's letter from the given choices directly. If you do not know the answer of a question, please generate 'Z' rather than randomly choose an option."
+# OPEN_ENDED_PROMPT = "Answer the question using a single word or phrase. If you do not know the answer of a question, please generate 'Z' rather than random guess the answer."
+
+with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+
+
+def replace_images_tokens(input_string):
+    for i in range(1, 8):
+        question_text = f"<image {i}>"
+        query_text = "<image>"
+        if question_text in input_string:
+            input_string = input_string.replace(question_text, query_text)
+    return input_string
+
+
+def parse_options(options):
+    option_letters = [chr(ord("A") + i) for i in range(len(options))]
+    choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)])
+    # choices_str += "\nZ. I do not know"
+    return choices_str
+
+
+def construct_prompt(doc):
+    question = doc["question"]
+    if doc["question_type"] == "multiple-choice":
+        # Weirdly, data["options"] is a string in MMMU Huggingface dataset
+        parsed_options = parse_options(ast.literal_eval(doc["options"]))
+        # parsed_options already prepends a newline so no need to add space here
+        question = f"{question}\n{parsed_options}\n"
+    else:
+        question = f"{question}\n"
+    return question
+
+
+def mmmu_doc_to_text(doc):
+    question = construct_prompt(doc)
+    if config["metadata"]["interleaved_format"]:
+        question = replace_images_tokens(question)
+    return question
+
+
+def mmmu_doc_to_visual(doc):
+    prompt = construct_prompt(doc)
+    image_tokens = re.findall(r"<image \d+>", prompt)
+    # Remove <> and swap space as _
+    image_tokens = sorted(list(set([image_token.strip("<>").replace(" ", "_") for image_token in image_tokens])))
+    visual = [doc[image_token].convert("RGB") for image_token in image_tokens]
+    return visual
+
+
+def mmmu_process_results(doc, results):
+    pred = results[0]
+    if doc["question_type"] == "multiple-choice":
+        index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["options"]))
+        parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
+    else:
+        parsed_pred = parse_open_response(pred)
+    id = doc["id"]
+    mmmu_acc = {"id": id, "subdomain": extract_subset_name(doc["id"]), "question_type": doc["question_type"], "answer": doc["answer"], "parsed_pred": parsed_pred}
+    return {
+        "mmmu_acc": mmmu_acc,
+        "submission": {
+            id: pred,
+        },
+    }
+
+
+def extract_subset_name(input_string):
+    # Define a regex pattern to match "validation_" at the beginning and "_" at the end
+    split = input_string.split("_")[0]
+    pattern = re.compile(rf"^{split}_(.+?)_\d+$")
+    match = pattern.search(input_string)
+    if match:
+        return match.group(1)
+    else:
+        raise ValueError(f'No match found in "{input_string}"')
+
+
+def mmmu_test_aggregate_results_for_submission(results, args):
+    path = generate_submission_file("mmmu_test_for_submission.json", args)
+    results_dict = {list(item.keys())[0]: list(item.values())[0] for item in results}
+    with open(path, "w") as f:
+        json.dump(results_dict, f)
+    eval_logger.info(f"Results saved to {path}.")
+
+
+def mmmu_aggregate_results(results):
+    evaluation_result = {}
+    subset_to_eval_samples = defaultdict(list)
+    for result in results:
+        subset_to_eval_samples[result["subdomain"]].append(result)
+    for subset, sub_eval_samples in subset_to_eval_samples.items():
+        judge_dict, metric_dict = evaluate_mmmu(sub_eval_samples)
+        metric_dict.update({"num_example": len(sub_eval_samples)})
+        evaluation_result[subset] = metric_dict
+    printable_results = {}
+    for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items():
+        in_domain_cat_results = {}
+        for cat_name in in_domain_cats:
+            if cat_name in evaluation_result.keys():
+                in_domain_cat_results[cat_name] = evaluation_result[cat_name]
+            else:
+                pass
+        in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results)
+        in_domain_data_num = sum([cat_results["num_example"] for cat_results in in_domain_cat_results.values()])
+        printable_results["Overall-" + domain] = {
+            "num": int(in_domain_data_num),
+            "acc": round(in_domain_ins_acc, 5),
+        }
+        # add sub category
+        for cat_name, cat_results in in_domain_cat_results.items():
+            printable_results[cat_name] = {
+                "num": int(cat_results["num_example"]),
+                "acc": round(cat_results["acc"], 5),
+            }
+    all_ins_acc = calculate_ins_level_acc(evaluation_result)
+    printable_results["Overall"] = {
+        "num": 
sum([cat_results["num_example"] for cat_results in evaluation_result.values()]), + "acc": round(all_ins_acc, 5), + } + print(printable_results) + return printable_results["Overall"]["acc"] + + +################## +# Helper functions written by official MMMU repo. +################## + + +def calculate_ins_level_acc(results): + """Calculate the instruction level accuracy for given Subject results + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L246 + """ + acc = 0 + ins_num = 0 + for cat_results in results.values(): + acc += cat_results["acc"] * cat_results["num_example"] + ins_num += cat_results["num_example"] + if ins_num == 0: + return 0 + return acc / ins_num + + +DOMAIN_CAT2SUB_CAT = { + "Art and Design": ["Art", "Art_Theory", "Design", "Music"], + "Business": ["Accounting", "Economics", "Finance", "Manage", "Marketing"], + "Science": [ + "Biology", + "Chemistry", + "Geography", + "Math", + "Physics", + ], + "Health and Medicine": [ + "Basic_Medical_Science", + "Clinical_Medicine", + "Diagnostics_and_Laboratory_Medicine", + "Pharmacy", + "Public_Health", + ], + "Humanities and Social Science": [ + "History", + "Literature", + "Sociology", + "Psychology", + ], + "Tech and Engineering": [ + "Agriculture", + "Architecture_and_Engineering", + "Computer_Science", + "Electronics", + "Energy_and_Power", + "Materials", + "Mechanical_Engineering", + ], +} + + +def eval_multi_choice(gold_i, pred_i): + """ + Evaluate a multiple choice instance. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L175 + """ + correct = False + # only they are exactly the same, we consider it as correct + if isinstance(gold_i, list): + for answer in gold_i: + if answer == pred_i: + correct = True + break + else: # gold_i is a string + if gold_i == pred_i: + correct = True + return correct + + +def eval_open(gold_i, pred_i): + """ + Evaluate an open question instance + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L191 + """ + correct = False + if isinstance(gold_i, list): + # use float to avoid trivial matches + norm_answers = [] + for answer in gold_i: + norm_answers.extend(normalize_str(answer)) + else: + norm_answers = normalize_str(gold_i) + for pred in pred_i: # pred is already normalized in parse response phase + if isinstance(pred, str): # if it's a string, then find if ans in the pred_i + for norm_ans in norm_answers: + # only see if the string answer in the string pred + if isinstance(norm_ans, str) and norm_ans in pred: + if not correct: + correct = True + break + else: # it's a float number + if pred in norm_answers: + if not correct: + correct = True + break + return correct + + +def evaluate_mmmu(samples): + """ + Batch evaluation for multiple choice and open questions. 
+ https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L219 + """ + pred_correct = 0 + judge_dict = dict() + for sample in samples: + gold_i = sample["answer"] + pred_i = sample["parsed_pred"] + if sample["question_type"] == "multiple-choice": + correct = eval_multi_choice(gold_i, pred_i) + else: # open question + correct = eval_open(gold_i, pred_i) + + if correct: + judge_dict[sample["id"]] = "Correct" + pred_correct += 1 + else: + judge_dict[sample["id"]] = "Wrong" + + if len(samples) == 0: + return {"acc": 0} + return judge_dict, {"acc": pred_correct / len(samples)} + + +# def parse_multi_choice_response(response, all_choices, index2ans): +# """ +# Parse the prediction from the generated response. +# Return the predicted index e.g., A, B, C, D. +# https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10 +# """ +# for char in [",", ".", "!", "?", ";", ":", "'"]: +# response = response.strip(char) +# response = " " + response + " " # add space to avoid partial match + +# index_ans = True +# ans_with_brack = False +# candidates = [] +# for choice in all_choices: # e.g., (A) (B) (C) (D) +# if f"({choice})" in response: +# candidates.append(choice) +# ans_with_brack = True + +# if len(candidates) == 0: +# for choice in all_choices: # e.g., A B C D +# if f"{choice} " in response: +# candidates.append(choice) + +# if len(candidates) == 0: +# for choice in all_choices: # e.g., A. B. C. D. +# if f"{choice}." in response: +# candidates.append(choice) + +# # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example +# if len(candidates) == 0 and len(response.split()) > 5: +# for index, ans in index2ans.items(): +# if ans.lower() in response.lower(): +# candidates.append(index) +# index_ans = False # it's content ans. + +# if len(candidates) == 0: # still not get answer, randomly choose one. +# pred_index = random.choice(all_choices) +# elif len(candidates) > 1: +# start_indexes = [] +# if index_ans: +# if ans_with_brack: +# for can in candidates: +# index = response.rfind(f"({can})") +# start_indexes.append(index) # -1 will be ignored anyway +# # start_indexes = [generated_response.index(f'({can})') for can in candidates] +# else: +# for can in candidates: +# index = response.rfind(f" {can} ") +# start_indexes.append(index) +# else: +# for can in candidates: +# index = response.lower().rfind(index2ans[can].lower()) +# start_indexes.append(index) +# # get the last one +# pred_index = candidates[np.argmax(start_indexes)] +# else: # if only one candidate, use it. +# pred_index = candidates[0] + +# return pred_index + + +def parse_multi_choice_response(response, all_choices, index2ans): + """ + Parse the prediction from the generated response. + Return the predicted index e.g., A, B, C, D, or Z if the response includes an out-of-range choice. 
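+    Adapted from https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10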
+ """ + for char in [",", ".", "!", "?", ";", ":", "'"]: + response = response.strip(char) + response = " " + response + " " # add space to avoid partial match + + index_ans = True + ans_with_brack = False + candidates = [] + + # # Check if response contains any invalid choice, e.g., Z + # for word in response.split(): + # if word not in all_choices and len(word) == 1 and word.isalpha(): + # return word # If the model outputs something like 'Z', return it immediately + + for choice in all_choices: # e.g., (A) (B) (C) (D) + if f"({choice})" in response: + candidates.append(choice) + ans_with_brack = True + + if len(candidates) == 0: + for choice in all_choices: # e.g., A B C D + if f"{choice} " in response: + candidates.append(choice) + + if len(candidates) == 0: + for choice in all_choices: # e.g., A. B. C. D. + if f"{choice}." in response: + candidates.append(choice) + + # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example + if len(candidates) == 0 and len(response.split()) > 5: + for index, ans in index2ans.items(): + if ans.lower() in response.lower(): + candidates.append(index) + index_ans = False # it's content ans. + + if len(candidates) == 0: # still not get answer, randomly choose one. + pred_index = random.choice(all_choices) + elif len(candidates) > 1: + start_indexes = [] + if index_ans: + if ans_with_brack: + for can in candidates: + index = response.rfind(f"({can})") + start_indexes.append(index) # -1 will be ignored anyway + else: + for can in candidates: + index = response.rfind(f" {can} ") + start_indexes.append(index) + else: + for can in candidates: + index = response.lower().rfind(index2ans[can].lower()) + start_indexes.append(index) + # get the last one + pred_index = candidates[np.argmax(start_indexes)] + else: # if only one candidate, use it. + pred_index = candidates[0] + + return pred_index + + +def extract_numbers(string): + """ + Exact all forms of numbers from a string with regex. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L100 + """ + # Pattern for numbers with commas + pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b" + # Pattern for scientific notation + pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+" + # Pattern for simple numbers without commas + pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])" + + # Extract numbers with commas + numbers_with_commas = re.findall(pattern_commas, string) + # Extract numbers in scientific notation + numbers_scientific = re.findall(pattern_scientific, string) + # Extract simple numbers without commas + numbers_simple = re.findall(pattern_simple, string) + + # Combine all extracted numbersz + all_numbers = numbers_with_commas + numbers_scientific + numbers_simple + return all_numbers + + +def check_is_number(string): + """ + Check if the given string a number. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L65 + """ + try: + float(string.replace(",", "")) + return True + except ValueError: + # check if there's comma inside + return False + + +def normalize_str(string): + """ + Normalize the str to lower case and make them float numbers if possible. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L76 + """ + # check if characters in the string + + # if number, numerize it. 
+ string = string.strip() + + is_number = check_is_number(string) + + if is_number: + string = string.replace(",", "") + string = float(string) + # leave 2 decimal + string = round(string, 2) + return [string] + else: # it's likely to be a string + # lower it + string = string.lower() + if len(string) == 1: + return [" " + string, string + " "] # avoid trivial matches + return [string] + + +def parse_open_response(response): + """ + Parse the prediction from the generated response. + Return a list of predicted strings or numbers. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L122 + """ + + # content = content.strip("\n").strip(".").strip(" ") + def get_key_subresponses(response): + key_responses = [] + response = response.strip().strip(".").lower() + sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response) + indicators_of_keys = [ + "could be ", + "so ", + "is ", + "thus ", + "therefore ", + "final ", + "answer ", + "result ", + ] + key_responses = [] + for index, resp in enumerate(sub_responses): + # if last one, accept it's an equation (the entire response can be just one sentence with equation) + if index == len(sub_responses) - 1: + indicators_of_keys.extend(["="]) + shortest_key_response = None # the shortest response that may contain the answer (tail part of the response) + for indicator in indicators_of_keys: + if indicator in resp: + if not shortest_key_response: + shortest_key_response = resp.split(indicator)[-1].strip() + else: + if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response): + shortest_key_response = resp.split(indicator)[-1].strip() + # key_responses.append(resp.split(indicator)[1].strip()) + + if shortest_key_response: + # and it's not trivial + if shortest_key_response.strip() not in [ + ":", + ",", + ".", + "!", + "?", + ";", + ":", + "'", + ]: + key_responses.append(shortest_key_response) + if len(key_responses) == 0: # did not found any + return [response] + return key_responses + + # pdb.set_trace() + key_responses = get_key_subresponses(response) + + pred_list = key_responses.copy() # keep the original string response + for resp in key_responses: + pred_list.extend(extract_numbers(resp)) + + tmp_pred_list = [] + for i in range(len(pred_list)): + tmp_pred_list.extend(normalize_str(pred_list[i])) + pred_list = tmp_pred_list + + # remove duplicates + pred_list = list(set(pred_list)) + + return pred_list + + +def get_multi_choice_info(options): + """ + Given the list of options for multiple choice question + Return the index2ans and all_choices + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/data_utils.py#L54 + """ + + start_chr = "A" + all_choices = [] + index2ans = {} + for i, option in enumerate(options): + index2ans[chr(ord(start_chr) + i)] = option + all_choices.append(chr(ord(start_chr) + i)) + + return index2ans, all_choices \ No newline at end of file diff --git a/lmms_eval/tasks/mmmu_pro/mmmu_pro_original.yaml b/lmms_eval/tasks/mmmu_pro/mmmu_pro_original.yaml index 0a5b8a5c..44b4cd73 100755 --- a/lmms_eval/tasks/mmmu_pro/mmmu_pro_original.yaml +++ b/lmms_eval/tasks/mmmu_pro/mmmu_pro_original.yaml @@ -1,6 +1,6 @@ task: "mmmu_pro_original" dataset_path: MMMU/MMMU_Pro -dataset_name: original +dataset_name: standard test_split: test output_type: generate_until doc_to_visual: !function utils.mmmu_pro_doc_to_visual diff --git a/lmms_eval/tasks/videosearch/_default_template_yaml b/lmms_eval/tasks/videosearch/_default_template_yaml new file mode 
100644 index 00000000..532edbf5 --- /dev/null +++ b/lmms_eval/tasks/videosearch/_default_template_yaml @@ -0,0 +1,12 @@ +dataset_path: lmms-lab/VideoSearch +dataset_kwargs: + token: True + video: True + cache_dir: Combined_milestone +lmms_eval_specific_kwargs: + default: + pre_prompt: "You should watch and learn the video content. Then apply what you learned to " + mcq_prompt: "answer the following multi-choice question. The image for this question is at the end of the video. \n" + open_ended_prompt: "answer the following open-ended question. The image for this question is at the end of the video. \n" + post_mcq_prompt: "Answer with the option's letter from the given choices directly. If you do not know the answer of a question, please generate 'Z' rather than randomly choose an option.\n" + post_open_ended_prompt: "Answer the question using a single word or phrase. If you do not know the answer of a question, please generate 'Z' rather than random guess the answer.\n" \ No newline at end of file diff --git a/lmms_eval/tasks/videosearch/combined_milestone.yaml b/lmms_eval/tasks/videosearch/combined_milestone.yaml new file mode 100755 index 00000000..3533a4f6 --- /dev/null +++ b/lmms_eval/tasks/videosearch/combined_milestone.yaml @@ -0,0 +1,16 @@ +dataset_name: "final_combined_milestone" +task: "combined_milestone" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.videosearch_doc_to_visual +doc_to_text: !function utils.videosearch_doc_to_text +doc_to_target: !function utils.videosearch_doc_to_answer +process_results: !function utils.videosearch_process_results +metric_list: + - metric: submission + aggregation: !function utils.videosearch_aggregate_results_for_submission + higher_is_better: true + - metric: mmmu_acc + aggregation: !function utils.videosearch_aggregate_results + higher_is_better: true +include: _default_template_yaml \ No newline at end of file diff --git a/lmms_eval/tasks/videosearch/utils.py b/lmms_eval/tasks/videosearch/utils.py new file mode 100755 index 00000000..f6f325b2 --- /dev/null +++ b/lmms_eval/tasks/videosearch/utils.py @@ -0,0 +1,511 @@ +from decord import VideoReader, cpu +from collections import defaultdict +import numpy as np +import os +import sys +import datetime +import lmms_eval.tasks._task_utils.file_utils as file_utils +import json +import ast +import re + +import yaml +import random +from pathlib import Path +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + +with open(Path(__file__).parent / "_default_template_yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +# We will unzip all the zip files +# To HF HOME cache dir +# And load it here +HF_HOME = os.environ["HF_HOME"] +cache_dir = config["dataset_kwargs"]["cache_dir"] +cache_dir = os.path.join(HF_HOME, cache_dir) +cache_dir = os.path.join(cache_dir, "videos") + +from loguru import logger as eval_logger + + +# Pass in video path here +# Can only work correctly with video llm +def videosearch_doc_to_visual(doc): + video_path = doc["id"] + ".mp4" + video_path = os.path.join(cache_dir, video_path) + alter_path = os.path.join(cache_dir, "validation_") + if os.path.exists(video_path): + video_path = video_path + elif os.path.exists(video_path.replace("mp4", "MP4")): + video_path = video_path.replace("mp4", "MP4") + else: + 
sys.exit(f"video path:{video_path} does not exist, please check") + return [video_path] + + +# This is the place where you format your question +def videosearch_doc_to_text(doc, lmms_eval_specific_kwargs=None): + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + pre_prompt = "" + post_prompt = "" + + pre_prompt = lmms_eval_specific_kwargs["pre_prompt"] + question = doc["question"] + + if doc["question_type"] == "multiple-choice": + pre_prompt += lmms_eval_specific_kwargs["mcq_prompt"] + post_prompt = lmms_eval_specific_kwargs["post_mcq_prompt"] + parsed_options = parse_options(ast.literal_eval(doc["options"])) + question += "\n" + parsed_options + else: + pre_prompt += lmms_eval_specific_kwargs["open_ended_prompt"] + post_prompt = lmms_eval_specific_kwargs["post_open_ended_prompt"] + + # print(f"{pre_prompt}{question}{post_prompt}") + return f"{pre_prompt}{question}{post_prompt}" + + +def parse_options(options): + option_letters = [chr(ord("A") + i) for i in range(len(options))] + choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]) + choices_str += "\nZ. I do not know\n\n" + return choices_str + + +def videosearch_doc_to_answer(doc): + return doc["answer"] + + +# process results for each individual instance +def videosearch_process_results(doc, results): + pred = results[0] + if doc["question_type"] == "multiple-choice": + index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["options"])) + parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans) + else: + parsed_pred = parse_open_response(pred) + id = doc["id"] + mmmu_acc = {"id": id, "subdomain": extract_subset_name(doc["id"]), "question_type": doc["question_type"], "answer": doc["answer"], "parsed_pred": parsed_pred} + return { + "mmmu_acc": mmmu_acc, + "submission": { + id: pred, + }, + } + + +# return subset name +def extract_subset_name(input_string): + # Define a regex pattern to match "validation_" at the beginning and "_" at the end + split = input_string.split("_")[0] + pattern = re.compile(rf"^{split}_(.+?)_\d+$") + match = pattern.search(input_string) + if match: + return match.group(1) + else: + raise ValueError(f'No match found in "{input_string}"') + + +def videosearch_aggregate_results_for_submission(results, args): + path = generate_submission_file("mmmu_for_submission.json", args) + results_dict = {list(item.keys())[0]: list(item.values())[0] for item in results} + with open(path, "w") as f: + json.dump(results_dict, f) + eval_logger.info(f"Results saved to {path}.") + + +def videosearch_aggregate_results(results): + evaluation_result = {} + subset_to_eval_samples = defaultdict(list) + for result in results: + subset_to_eval_samples[result["subdomain"]].append(result) + for subset, sub_eval_samples in subset_to_eval_samples.items(): + print(sub_eval_samples) + judge_dict, metric_dict = evaluate_mmmu(sub_eval_samples) + metric_dict.update({"num_example": len(sub_eval_samples)}) + evaluation_result[subset] = metric_dict + printable_results = {} + for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): + in_domain_cat_results = {} + for cat_name in in_domain_cats: + if cat_name in evaluation_result.keys(): + in_domain_cat_results[cat_name] = evaluation_result[cat_name] + else: + pass + in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) + in_domain_data_num = sum([cat_results["num_example"] for cat_results in in_domain_cat_results.values()]) + printable_results["Overall-" + domain] = { + "num": 
int(in_domain_data_num), + "acc": round(in_domain_ins_acc, 5), + } + # add sub category + for cat_name, cat_results in in_domain_cat_results.items(): + printable_results[cat_name] = { + "num": int(cat_results["num_example"]), + "acc": round(cat_results["acc"], 5), + } + all_ins_acc = calculate_ins_level_acc(evaluation_result) + printable_results["Overall"] = { + "num": sum([cat_results["num_example"] for cat_results in evaluation_result.values()]), + "acc": round(all_ins_acc, 5), + } + print(printable_results) + return printable_results["Overall"]["acc"] + + +################## +# Helper functions written by official MMMU repo. +################## + + +def calculate_ins_level_acc(results): + """Calculate the instruction level accuracy for given Subject results + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L246 + """ + acc = 0 + ins_num = 0 + for cat_results in results.values(): + acc += cat_results["acc"] * cat_results["num_example"] + ins_num += cat_results["num_example"] + if ins_num == 0: + return 0 + return acc / ins_num + + +DOMAIN_CAT2SUB_CAT = { + "Art and Design": + ["Art", "Art_Theory", "Design", "Music"], + "Business": + ["Accounting", "Economics", "Finance", "Manage", "Marketing"], + "Science": [ + "Biology", + "Chemistry", + "Geography", + "Math", + "Physics", + ], + "Health and Medicine": [ + "Basic_Medical_Science", + "Clinical_Medicine", + "Diagnostics_and_Laboratory_Medicine", + "Pharmacy", + "Public_Health", + ], + "Humanities and Social Science": [ + "History", + "Literature", + "Sociology", + "Psychology", + ], + "Tech and Engineering": [ + "Agriculture", + "Architecture_and_Engineering", + "Computer_Science", + "Electronics", + "Energy_and_Power", + "Materials", + "Mechanical_Engineering", + ], +} + + +def eval_multi_choice(gold_i, pred_i): + """ + Evaluate a multiple choice instance. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L175 + """ + correct = False + # only they are exactly the same, we consider it as correct + # print(gold_i) + if isinstance(gold_i, list): + for answer in gold_i: + if answer == pred_i: + correct = True + break + else: # gold_i is a string + if gold_i == pred_i: + correct = True + return correct + + +def eval_open(gold_i, pred_i): + """ + Evaluate an open question instance + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L191 + """ + # print(gold_i) + correct = False + if isinstance(gold_i, list): + # use float to avoid trivial matches + norm_answers = [] + for answer in gold_i: + norm_answers.extend(normalize_str(answer)) + else: + norm_answers = normalize_str(gold_i) + for pred in pred_i: # pred is already normalized in parse response phase + if isinstance(pred, str): # if it's a string, then find if ans in the pred_i + for norm_ans in norm_answers: + # only see if the string answer in the string pred + if isinstance(norm_ans, str) and norm_ans in pred: + if not correct: + correct = True + break + else: # it's a float number + if pred in norm_answers: + if not correct: + correct = True + break + return correct + + +def evaluate_mmmu(samples): + """ + Batch evaluation for multiple choice and open questions. 
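+    Returns (judge_dict, metric_dict): judge_dict maps each sample id to "Correct" or "Wrong",
+    and metric_dict reports the overall accuracy under the "acc" key.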
+ https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L219 + """ + pred_correct = 0 + judge_dict = dict() + for sample in samples: + gold_i = sample["answer"] + pred_i = sample["parsed_pred"] + if sample["question_type"] == "multiple-choice": + correct = eval_multi_choice(gold_i, pred_i) + else: # open question + correct = eval_open(gold_i, pred_i) + + if correct: + judge_dict[sample["id"]] = "Correct" + pred_correct += 1 + else: + judge_dict[sample["id"]] = "Wrong" + + if len(samples) == 0: + return {"acc": 0} + return judge_dict, {"acc": pred_correct / len(samples)} + + +def parse_multi_choice_response(response, all_choices, index2ans): + """ + Parse the prediction from the generated response. + Return the predicted index e.g., A, B, C, D. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10 + """ + for char in [",", ".", "!", "?", ";", ":", "'"]: + response = response.strip(char) + response = " " + response + " " # add space to avoid partial match + + index_ans = True + ans_with_brack = False + candidates = [] + + # Check if response contains any invalid choice, e.g., Z + for word in response.split(): + if word not in all_choices and len(word) == 1 and word.isalpha(): + return word # If the model outputs something like 'Z', return it immediately + + for choice in all_choices: # e.g., (A) (B) (C) (D) + if f"({choice})" in response: + candidates.append(choice) + ans_with_brack = True + + if len(candidates) == 0: + for choice in all_choices: # e.g., A B C D + if f"{choice} " in response: + candidates.append(choice) + + if len(candidates) == 0: + for choice in all_choices: # e.g., A. B. C. D. + if f"{choice}." in response: + candidates.append(choice) + + # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example + if len(candidates) == 0 and len(response.split()) > 5: + for index, ans in index2ans.items(): + if ans.lower() in response.lower(): + candidates.append(index) + index_ans = False # it's content ans. + + if len(candidates) == 0: # still not get answer, randomly choose one. + pred_index = random.choice(all_choices) + elif len(candidates) > 1: + start_indexes = [] + if index_ans: + if ans_with_brack: + for can in candidates: + index = response.rfind(f"({can})") + start_indexes.append(index) # -1 will be ignored anyway + # start_indexes = [generated_response.index(f'({can})') for can in candidates] + else: + for can in candidates: + index = response.rfind(f" {can} ") + start_indexes.append(index) + else: + for can in candidates: + index = response.lower().rfind(index2ans[can].lower()) + start_indexes.append(index) + # get the last one + pred_index = candidates[np.argmax(start_indexes)] + else: # if only one candidate, use it. + pred_index = candidates[0] + + return pred_index + + +def extract_numbers(string): + """ + Exact all forms of numbers from a string with regex. 
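+    Covers comma-grouped numbers, scientific notation, and plain integers/decimals.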
+ https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L100 + """ + # Pattern for numbers with commas + pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b" + # Pattern for scientific notation + pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+" + # Pattern for simple numbers without commas + pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])" + + # Extract numbers with commas + numbers_with_commas = re.findall(pattern_commas, string) + # Extract numbers in scientific notation + numbers_scientific = re.findall(pattern_scientific, string) + # Extract simple numbers without commas + numbers_simple = re.findall(pattern_simple, string) + + # Combine all extracted numbersz + all_numbers = numbers_with_commas + numbers_scientific + numbers_simple + return all_numbers + + +def check_is_number(string): + """ + Check if the given string a number. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L65 + """ + try: + float(string.replace(",", "")) + return True + except ValueError: + # check if there's comma inside + return False + + +def normalize_str(string): + """ + Normalize the str to lower case and make them float numbers if possible. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L76 + """ + # check if characters in the string + + # if number, numerize it. + string = string.strip() + + is_number = check_is_number(string) + + if is_number: + string = string.replace(",", "") + string = float(string) + # leave 2 decimal + string = round(string, 2) + return [string] + else: # it's likely to be a string + # lower it + string = string.lower() + if len(string) == 1: + return [" " + string, string + " "] # avoid trivial matches + return [string] + + +def parse_open_response(response): + """ + Parse the prediction from the generated response. + Return a list of predicted strings or numbers. 
+ https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L122 + """ + + # content = content.strip("\n").strip(".").strip(" ") + def get_key_subresponses(response): + key_responses = [] + response = response.strip().strip(".").lower() + sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response) + indicators_of_keys = [ + "could be ", + "so ", + "is ", + "thus ", + "therefore ", + "final ", + "answer ", + "result ", + ] + key_responses = [] + for index, resp in enumerate(sub_responses): + # if last one, accept it's an equation (the entire response can be just one sentence with equation) + if index == len(sub_responses) - 1: + indicators_of_keys.extend(["="]) + shortest_key_response = None # the shortest response that may contain the answer (tail part of the response) + for indicator in indicators_of_keys: + if indicator in resp: + if not shortest_key_response: + shortest_key_response = resp.split(indicator)[-1].strip() + else: + if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response): + shortest_key_response = resp.split(indicator)[-1].strip() + # key_responses.append(resp.split(indicator)[1].strip()) + + if shortest_key_response: + # and it's not trivial + if shortest_key_response.strip() not in [ + ":", + ",", + ".", + "!", + "?", + ";", + ":", + "'", + ]: + key_responses.append(shortest_key_response) + if len(key_responses) == 0: # did not found any + return [response] + return key_responses + + # pdb.set_trace() + key_responses = get_key_subresponses(response) + + pred_list = key_responses.copy() # keep the original string response + for resp in key_responses: + pred_list.extend(extract_numbers(resp)) + + tmp_pred_list = [] + for i in range(len(pred_list)): + tmp_pred_list.extend(normalize_str(pred_list[i])) + pred_list = tmp_pred_list + + # remove duplicates + pred_list = list(set(pred_list)) + + return pred_list + + +def get_multi_choice_info(options): + """ + Given the list of options for multiple choice question + Return the index2ans and all_choices + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/data_utils.py#L54 + """ + + start_chr = "A" + all_choices = [] + index2ans = {} + for i, option in enumerate(options): + index2ans[chr(ord(start_chr) + i)] = option + all_choices.append(chr(ord(start_chr) + i)) + + return index2ans, all_choices diff --git a/lmms_eval/tasks/videosearch/videosearch.yaml b/lmms_eval/tasks/videosearch/videosearch.yaml new file mode 100755 index 00000000..352b4ba5 --- /dev/null +++ b/lmms_eval/tasks/videosearch/videosearch.yaml @@ -0,0 +1,16 @@ +dataset_name: "first_milestone_updated" +task: "first_milestone_updated" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.videosearch_doc_to_visual +doc_to_text: !function utils.videosearch_doc_to_text +doc_to_target: !function utils.videosearch_doc_to_answer +process_results: !function utils.videosearch_process_results +metric_list: + - metric: submission + aggregation: !function utils.videosearch_aggregate_results_for_submission + higher_is_better: true + - metric: mmmu_acc + aggregation: !function utils.videosearch_aggregate_results + higher_is_better: true +include: _default_template_yaml \ No newline at end of file