diff --git a/lmms_eval/models/llava_vid.py b/lmms_eval/models/llava_vid.py
index 1da10083..ebffb6dc 100755
--- a/lmms_eval/models/llava_vid.py
+++ b/lmms_eval/models/llava_vid.py
@@ -32,6 +32,7 @@
 try:
     from llavavid.model.language_model.llava_qwen import LlavaQwenConfig
 
+    AutoConfig.register("llava_qwen", LlavaQwenConfig)
 except:
     eval_logger.debug("No llava vid qwen yet for llavavid")
 
@@ -346,7 +347,7 @@ def generate_until(self, requests) -> List[str]:
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
             if "temperature" not in gen_kwargs:
-                gen_kwargs["temperature"] = 0
+                gen_kwargs["temperature"] = 0.2
             if "top_p" not in gen_kwargs:
                 gen_kwargs["top_p"] = None
             if "num_beams" not in gen_kwargs:
diff --git a/lmms_eval/tasks/cvrr/cvrr_fine_grained_action_understanding.yaml b/lmms_eval/tasks/cvrr/cvrr_fine_grained_action_understanding.yaml
new file mode 100755
index 00000000..a1cc069c
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_fine_grained_action_understanding.yaml
@@ -0,0 +1,14 @@
+dataset_name: "fine_grained_action_understanding"
+task: "cvrr_fine_grained_action_understanding"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim2
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_interpretation_of_social_context.yaml b/lmms_eval/tasks/cvrr/cvrr_interpretation_of_social_context.yaml
new file mode 100755
index 00000000..cf8cbe7f
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_interpretation_of_social_context.yaml
@@ -0,0 +1,14 @@
+dataset_name: "interpretation_of_social_context"
+task: "cvrr_interpretation_of_social_context"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim3
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_interpretation_of_visual_context.yaml b/lmms_eval/tasks/cvrr/cvrr_interpretation_of_visual_context.yaml
new file mode 100755
index 00000000..5cc10686
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_interpretation_of_visual_context.yaml
@@ -0,0 +1,14 @@
+dataset_name: "interpretation_of_visual_context"
+task: "cvrr_interpretation_of_visual_context"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim4
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_multiple_actions_in_a_single_video.yaml b/lmms_eval/tasks/cvrr/cvrr_multiple_actions_in_a_single_video.yaml
new file mode 100755
index 00000000..12f861a3
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_multiple_actions_in_a_single_video.yaml
@@ -0,0 +1,14 @@
+dataset_name: "multiple_actions_in_a_single_video"
+task: "cvrr_multiple_actions_in_a_single_video"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim5
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_non_existent_actions_with_existent_scene_depictions.yaml b/lmms_eval/tasks/cvrr/cvrr_non_existent_actions_with_existent_scene_depictions.yaml
new file mode 100755
index 00000000..a2da9410
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_non_existent_actions_with_existent_scene_depictions.yaml
@@ -0,0 +1,14 @@
+dataset_name: "non_existent_actions_with_existent_scene_depictions"
+task: "cvrr_non_existent_actions_with_existent_scene_depictions"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim6
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_non_existent_actions_with_non_existent_scene_depictions.yaml b/lmms_eval/tasks/cvrr/cvrr_non_existent_actions_with_non_existent_scene_depictions.yaml
new file mode 100755
index 00000000..b77dad4e
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_non_existent_actions_with_non_existent_scene_depictions.yaml
@@ -0,0 +1,14 @@
+dataset_name: "non_existent_actions_with_non_existent_scene_depictions"
+task: "cvrr_non_existent_actions_with_non_existent_scene_depictions"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim7
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml b/lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml
index fc85014a..d469f619 100755
--- a/lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml
+++ b/lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml
@@ -8,7 +8,7 @@ doc_to_target: !function utils.cvrr_doc_to_answer
 process_results: !function utils.cvrr_process_results
 metric_list:
   - metric: submission
-    aggregation: !function utils.cvrr_aggregate_results
+    aggregation: !function utils.cvrr_aggregate_results_dim1
     higher_is_better: true
 include: _default_template_yaml
 
diff --git a/lmms_eval/tasks/cvrr/cvrr_partial_actions.yaml b/lmms_eval/tasks/cvrr/cvrr_partial_actions.yaml
new file mode 100755
index 00000000..6379225e
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_partial_actions.yaml
@@ -0,0 +1,14 @@
+dataset_name: "partial_actions"
+task: "cvrr_partial_actions"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim8
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_time_order_understanding.yaml b/lmms_eval/tasks/cvrr/cvrr_time_order_understanding.yaml
new file mode 100755
index 00000000..7d330fcd
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_time_order_understanding.yaml
@@ -0,0 +1,14 @@
+dataset_name: "time_order_understanding"
+task: "cvrr_time_order_understanding"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim9
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_understanding_emotional_context.yaml b/lmms_eval/tasks/cvrr/cvrr_understanding_emotional_context.yaml
new file mode 100755
index 00000000..f5128acb
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_understanding_emotional_context.yaml
@@ -0,0 +1,14 @@
+dataset_name: "understanding_emotional_context"
+task: "cvrr_understanding_emotional_context"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim10
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/cvrr_unusual_and_physically_anomalous_activities.yaml b/lmms_eval/tasks/cvrr/cvrr_unusual_and_physically_anomalous_activities.yaml
new file mode 100755
index 00000000..92555a29
--- /dev/null
+++ b/lmms_eval/tasks/cvrr/cvrr_unusual_and_physically_anomalous_activities.yaml
@@ -0,0 +1,14 @@
+dataset_name: "unusual_and_physically_anomalous_activities"
+task: "cvrr_unusual_and_physically_anomalous_activities"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.cvrr_doc_to_visual
+doc_to_text: !function utils.cvrr_doc_to_text
+doc_to_target: !function utils.cvrr_doc_to_answer
+process_results: !function utils.cvrr_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.cvrr_aggregate_results_dim11
+    higher_is_better: true
+include: _default_template_yaml
+
diff --git a/lmms_eval/tasks/cvrr/utils.py b/lmms_eval/tasks/cvrr/utils.py
index 0e300d4f..01dfc54e 100755
--- a/lmms_eval/tasks/cvrr/utils.py
+++ b/lmms_eval/tasks/cvrr/utils.py
@@ -41,25 +41,47 @@
     "Content-Type": "application/json",
 }
 
-# Unzip all the zip files to HF HOME cache dir
-HF_HOME = os.environ["HF_HOME"]
-cache_dir = config["dataset_kwargs"]["cache_dir"]
-cache_dir = os.path.join(HF_HOME, cache_dir)
-cache_dir = os.path.join(cache_dir, "CVRR-ES/continuity_and_object_instance_count")
-
 # Pass in video path here
 # Can only work correctly with video llm
 def cvrr_doc_to_visual(doc):
-    video_path = doc["VideoID"]
+
+    # Unzip all the zip files to HF HOME cache dir
+    HF_HOME = os.environ["HF_HOME"]
+    cache_dir = config["dataset_kwargs"]["cache_dir"]
+    cache_dir = os.path.join(HF_HOME, cache_dir)
+    cache_dir = os.path.join(cache_dir, "CVRR-ES")
     if doc["DimensionName"] == "Continuity and Object Instance Count":
+        cache_dir = os.path.join(cache_dir, "continuity_and_object_instance_count")
+    elif doc["DimensionName"] == "Fine-grained action understanding":
+        cache_dir = os.path.join(cache_dir, "fine_grained_action_understanding")
+    elif doc["DimensionName"] == "Interpretation of social context":
+        cache_dir = os.path.join(cache_dir, "interpretation_of_social_context")
+    elif doc["DimensionName"] == "Interpretation of visual context":
+        cache_dir = os.path.join(cache_dir, "interpretation_of_visual_context")
+    elif doc["DimensionName"] == "Multiple actions in a single video":
+        cache_dir = os.path.join(cache_dir, "multiple_actions_in_a_single_video")
+    elif doc["DimensionName"] == "Non-existent actions with existent scene depictions":
+        cache_dir = os.path.join(cache_dir, "non_existent_actions_with_existent_scene_depictions")
+    elif doc["DimensionName"] == "Non-existent actions with non-existent scene depictions":
+        cache_dir = os.path.join(cache_dir, "non_existent_actions_with_non_existent_scene_depictions")
+    elif doc["DimensionName"] == "Partial actions":
+        cache_dir = os.path.join(cache_dir, "partial_actions")
+    elif doc["DimensionName"] == "Time order understanding":
+        cache_dir = os.path.join(cache_dir, "time_order_understanding")
+    elif doc["DimensionName"] == "Understanding of emotional context":
+        cache_dir = os.path.join(cache_dir, "understanding_emotional_context")
+    elif doc["DimensionName"] == "Unusual and Physically Anomalous activities":
+        cache_dir = os.path.join(cache_dir, "unusual_and_physically_anomalous_activities")
+
+    video_path = doc["VideoID"]
+    video_path = os.path.join(cache_dir, video_path)
+    if os.path.exists(video_path):
+        video_path = video_path
+    else:
+        sys.exit(f"video path:{video_path} does not exist, please check")
-        video_path = os.path.join(cache_dir, video_path)
-        if os.path.exists(video_path):
-            video_path = video_path
-        else:
-            sys.exit(f"video path:{video_path} does not exist, please check")
 
     return [video_path]
 
 
@@ -109,34 +131,31 @@ def cvrr_aggregate_submissions(results, args, task):
     return path
 
 
-def get_eval(question, answer, pred, task, max_tokens: int, retries: int = 5):
+def get_eval(question, answer, pred, max_tokens: int, retries: int = 5):
     global headers
-    if task == "continuity_and_object_instance_count":
-        messages = [
-            {
-                "role": "system",
-                "content": "You are an intelligent chatbot designed for evaluating the correctness of AI assistant predictions for question-answer pairs. "
-                "Your task is to compare the predicted answer with the ground-truth answer and determine if the predicted answer is correct or not. Here's how you can accomplish the task:"
-                "------"
-                "##INSTRUCTIONS: "
-                "- Focus on the correctness and accuracy of the predicted answer with the ground-truth.\n"
-                "- Consider predictions with less specific details as correct evaluation, unless such details are explicitly asked in the question.\n",
-            },
-            {
-                "role": "user",
-                "content": "Please evaluate the following video-based question-answer pair:\n\n"
-                f"Question: {question}\n"
-                f"Ground truth correct Answer: {answer}\n"
-                f"Predicted Answer: {pred}\n\n"
-                "Provide your evaluation as a correct/incorrect prediction along with the score where the score is an integer value between 0 (fully wrong) and 5 (fully correct). The middle score provides the percentage of correctness."
-                "Please generate the response in the form of a Python dictionary string with keys 'pred', 'score' and 'reason', where value of 'pred' is a string of 'correct' or 'incorrect', value of 'score' is in INTEGER, not STRING and value of 'reason' should provide the reason behind the decision."
-                "Only provide the Python dictionary string."
- "For example, your response should look like this: {'pred': 'correct', 'score': 4.8, 'reason': reason}.", - }, - ] - - print(messages) + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the correctness of AI assistant predictions for question-answer pairs. " + "Your task is to compare the predicted answer with the ground-truth answer and determine if the predicted answer is correct or not. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the correctness and accuracy of the predicted answer with the ground-truth.\n" + "- Consider predictions with less specific details as correct evaluation, unless such details are explicitly asked in the question.\n", + }, + { + "role": "user", + "content": "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Ground truth correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation as a correct/incorrect prediction along with the score where the score is an integer value between 0 (fully wrong) and 5 (fully correct). The middle score provides the percentage of correctness." + "Please generate the response in the form of a Python dictionary string with keys 'pred', 'score' and 'reason', where value of 'pred' is a string of 'correct' or 'incorrect', value of 'score' is in INTEGER, not STRING and value of 'reason' should provide the reason behind the decision." + "Only provide the Python dictionary string." + 'For example, your response should look like this: {"pred": "correct", "score": 4.8, "reason": reason}.', + }, + ] payload = { "model": GPT_EVAL_MODEL_NAME, @@ -179,6 +198,7 @@ def get_eval(question, answer, pred, task, max_tokens: int, retries: int = 5): def parse_score(review): try: # Convert the string representation of a dictionary to an actual dictionary + # Escape single quotes inside the dictionary string to prevent parsing errors review_dict = ast.literal_eval(review) correctness = review_dict.get("pred", "incorrect") score = review_dict.get("score", 0) @@ -203,38 +223,28 @@ def cvrr_print_scores(eval_file_path, args): score_file_name = "scores.json" path = file_utils.generate_submission_file(score_file_name, args) - # Compute average score and final accuracy - # Initialize counters - yes_count = 0 - no_count = 0 + # Compute average score total_score = 0 - # Iterate over the results to count correctness and sum scores + # Iterate over the results to sum scores for result_list in evaluated_list: eval_dict = result_list[0] total_score += eval_dict["score"] - if eval_dict["Correctness"] == "yes": - yes_count += 1 - else: - no_count += 1 - # Calculate accuracy and average score - accuracy = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 average_score = total_score / len(evaluated_list) if evaluated_list else 0 # Print the results - print(f"Accuracy: {accuracy}") print(f"Average Score: {average_score}") # Write the processed data to the scores file with open(path, "w") as f: - json.dump({"accuracy": accuracy, "average_score": average_score}, f, indent=4) + json.dump({"average_score": average_score}, f, indent=4) eval_logger.info(f"Score file saved to {path}") -def cvrr_gpt_eval(result_file_path, args, task): +def cvrr_gpt_eval(result_file_path, args): """ Process the result file containing predictions, score them using GPT, and save the results with added scores and correctness fields to a new file. 
@@ -264,7 +274,7 @@ def cvrr_gpt_eval(result_file_path, args, task):
             pred = data_dict.get("pred", "")
 
             # Assume get_eval returns a review and the model name, and parse_score parses this review
-            review, model_name = get_eval(question, answer, pred, task, 64)
+            review, model_name = get_eval(question, answer, pred, 512)
             correctness, score, reason = parse_score(review)
         except Exception as e:
             eval_logger.error(f"Error for Video Name: {data_dict.get('VideoID', 'Unknown')}: {e}")
@@ -295,7 +305,67 @@
     return eval_file_path
 
 
-def cvrr_aggregate_results(results, args):
+def cvrr_aggregate_results_dim1(results, args):
     result_file_path = cvrr_aggregate_submissions(results, args, "continuity_and_object_instance_count")
-    eval_file_path = cvrr_gpt_eval(result_file_path, args, "continuity_and_object_instance_count")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim2(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "fine_grained_action_understanding")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim3(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "interpretation_of_social_context")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim4(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "interpretation_of_visual_context")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim5(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "multiple_actions_in_a_single_video")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim6(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "non_existent_actions_with_existent_scene_depictions")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim7(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "non_existent_actions_with_non_existent_scene_depictions")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim8(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "partial_actions")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim9(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "time_order_understanding")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim10(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "understanding_emotional_context")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
+    cvrr_print_scores(eval_file_path, args)
+
+
+def cvrr_aggregate_results_dim11(results, args):
+    result_file_path = cvrr_aggregate_submissions(results, args, "unusual_and_physically_anomalous_activities")
+    eval_file_path = cvrr_gpt_eval(result_file_path, args)
     cvrr_print_scores(eval_file_path, args)
diff --git a/tools/make_activitynetqa.ipynb b/tools/make_activitynetqa.ipynb
deleted file mode 100755
index efbef00e..00000000
--- a/tools/make_activitynetqa.ipynb
+++ /dev/null
@@ -1,120 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# This notebook will guide you to make correct format of Huggingface dataset, in proper parquet format and visualizable in Huggingface dataset hub.\n",
-    "# We will take the example of the dataset \"Otter-AI/MMVet\" and convert it to the proper format."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datasets import load_dataset\n",
-    "\n",
-    "data_path = \"Otter-AI/MMVet\"\n",
-    "df = load_dataset(data_path, split=\"test\").to_pandas()\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datasets import Dataset, Features, Value, Image\n",
-    "import pandas as pd\n",
-    "\n",
-    "# Define the features for the dataset\n",
-    "features = Features(\n",
-    "    {\n",
-    "        \"question_id\": Value(dtype=\"string\"),\n",
-    "        \"image\": Image(),\n",
-    "        \"question\": Value(dtype=\"string\"),\n",
-    "        \"answer\": Value(dtype=\"string\"),\n",
-    "        \"image_source\": Value(dtype=\"string\"),\n",
-    "        \"capability\": Value(dtype=\"string\"),\n",
-    "        # Add other fields as necessary\n",
-    "    }\n",
-    ")\n",
-    "\n",
-    "df_items = {\n",
-    "    \"question_id\": [],\n",
-    "    \"image\": [],\n",
-    "    \"question\": [],\n",
-    "    \"answer\": [],\n",
-    "    \"image_source\": [],\n",
-    "    \"capability\": [],\n",
-    "}\n",
-    "\n",
-    "for idx, row in df.iterrows():\n",
-    "    df_items[\"question_id\"].append(str(row[\"id\"]))\n",
-    "    image = {\"bytes\": row[\"images\"][0][\"bytes\"], \"path\": \"\"}\n",
-    "    df_items[\"image\"].append(image)\n",
-    "    df_items[\"question\"].append(str(row[\"instruction\"]))\n",
-    "    df_items[\"answer\"].append(str(row[\"answer\"]))\n",
-    "    df_items[\"image_source\"].append(str(row[\"image_source\"]))\n",
-    "    df_items[\"capability\"].append(\",\".join(list(row[\"capability\"])))\n",
-    "    # Add other fields as necessary\n",
-    "\n",
-    "df_items = pd.DataFrame(df_items)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_items.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = Dataset.from_pandas(df_items, features=features)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "hub_dataset_path = \"lmms-lab/MMVet\"\n",
-    "dataset.push_to_hub(repo_id=hub_dataset_path, split=\"test\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "lmms-eval",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.18"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
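
Note on the aggregation plumbing in `utils.py`: the eleven `cvrr_aggregate_results_dimN` functions added above differ only in the sub-folder name passed to `cvrr_aggregate_submissions`. A minimal sketch of how they could be generated from one helper is shown below; this is not part of the patch, it assumes the code sits in the same `lmms_eval/tasks/cvrr/utils.py` module next to the helpers it calls, and the `_DIMENSIONS` / `_make_aggregator` names are hypothetical. Whether the YAML `!function` resolver accepts dynamically created module attributes would need to be verified before adopting it.

```python
# Hypothetical refactor sketch -- NOT part of the patch above.
# Assumes it lives in lmms_eval/tasks/cvrr/utils.py alongside
# cvrr_aggregate_submissions, cvrr_gpt_eval and cvrr_print_scores.
_DIMENSIONS = [
    "continuity_and_object_instance_count",
    "fine_grained_action_understanding",
    "interpretation_of_social_context",
    "interpretation_of_visual_context",
    "multiple_actions_in_a_single_video",
    "non_existent_actions_with_existent_scene_depictions",
    "non_existent_actions_with_non_existent_scene_depictions",
    "partial_actions",
    "time_order_understanding",
    "understanding_emotional_context",
    "unusual_and_physically_anomalous_activities",
]


def _make_aggregator(dimension):
    # Each generated function keeps the (results, args) signature that the
    # YAML hook `aggregation: !function utils.cvrr_aggregate_results_dimN` expects.
    def _aggregate(results, args):
        result_file_path = cvrr_aggregate_submissions(results, args, dimension)
        eval_file_path = cvrr_gpt_eval(result_file_path, args)
        cvrr_print_scores(eval_file_path, args)

    return _aggregate


# Expose cvrr_aggregate_results_dim1 ... cvrr_aggregate_results_dim11 as module attributes.
for _i, _dim in enumerate(_DIMENSIONS, start=1):
    globals()[f"cvrr_aggregate_results_dim{_i}"] = _make_aggregator(_dim)
```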