diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index 5560745c..f488bb22 100755
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -67,8 +67,10 @@ class TaskConfig(dict):
     validation_split: str = None
     test_split: str = None
     fewshot_split: str = None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+    full_docs: bool = False
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
+    process_results_use_image: bool = False
     process_docs: Callable = None
     doc_to_visual: Union[Callable, str] = None
     doc_to_text: Union[Callable, str] = None
diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
index 48397a0a..0e890533 100755
--- a/lmms_eval/evaluator.py
+++ b/lmms_eval/evaluator.py
@@ -327,7 +327,7 @@ def evaluate(
         # hack: remove image columns to speed avoid loading images and speed up postprocessing
         # reason: doc_iterator will actually load image if it's in the doc.
         docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
-        if "d170" not in task_name and "dc100" not in task_name and "dc200" not in task_name and "llava_wilder" not in task_name and "live_bench" not in task_name and "wildvision" not in task_name:
+        if not task.config["process_results_use_image"]:
             remove_cols = []
             features = docs.features
             # If it is an Image instance or a Sequence of Image instance. Remove it
@@ -340,10 +340,7 @@
             docs = docs.remove_columns(remove_cols)

         ####################### Processing with Full Docs Mode #######################
-        if task_name in ["videochatgpt_consistency"]:
-            full_docs = True
-        else:
-            full_docs = False
+        full_docs = task.config["full_docs"]

         doc_iterator = itertools.islice(enumerate(docs), lm.rank, limit, lm.world_size)
         # Instead of converting the iterator to a list, use `itertools.tee` to create a parallel iterator for counting
diff --git a/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml b/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml
index cd692a1d..5281ad1d 100755
--- a/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml
+++ b/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml
@@ -1,4 +1,5 @@
 model_specific_prompt_kwargs:
   default:
     pre_prompt: ""
-    post_prompt: ""
\ No newline at end of file
+    post_prompt: ""
+process_results_use_image: true
diff --git a/lmms_eval/tasks/live_bench/live_bench.yaml b/lmms_eval/tasks/live_bench/live_bench.yaml
old mode 100644
new mode 100755
index b4a09853..1253105e
--- a/lmms_eval/tasks/live_bench/live_bench.yaml
+++ b/lmms_eval/tasks/live_bench/live_bench.yaml
@@ -1,31 +1,8 @@
-dataset_path: lmms-lab/LiveBench
-dataset_kwargs:
-  token: True
-task: "live_bench"
-test_split: test
-dataset_name: 2024-07
-output_type: generate_until
-doc_to_visual: !function utils.livebench_doc_to_visual
-doc_to_text: !function utils.livebench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 1024
-  temperature: 0
-  top_p: 1.0
-  num_beams: 1
-  do_sample: false
-process_results: !function utils.livebench_process_results
-metric_list:
-  - metric: gpt4_eval_score
-    aggregation: !function utils.livebench_aggregate_results
-    higher_is_better: true
-  # - metric: gpt4_eval_score_mini
-  #   aggregation: !function utils.livebench_aggregate_results
-  #   higher_is_better: true
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: ""
+group: live_bench
+task:
+- live_bench_2406
+- live_bench_2407
+
 metadata:
   api_type : openai
   eval_with_mini: false
diff --git a/lmms_eval/tasks/live_bench/live_bench_2406.yaml b/lmms_eval/tasks/live_bench/live_bench_2406.yaml
new file mode 100644
index 00000000..c82eba4c
--- /dev/null
+++ b/lmms_eval/tasks/live_bench/live_bench_2406.yaml
@@ -0,0 +1,3 @@
+task: "live_bench_2406"
+dataset_name: 2024-06
+include: live_bench_template_yaml
diff --git a/lmms_eval/tasks/live_bench/live_bench_2407.yaml b/lmms_eval/tasks/live_bench/live_bench_2407.yaml
new file mode 100644
index 00000000..d7791345
--- /dev/null
+++ b/lmms_eval/tasks/live_bench/live_bench_2407.yaml
@@ -0,0 +1,3 @@
+task: "live_bench_2407"
+dataset_name: 2024-07
+include: live_bench_template_yaml
diff --git a/lmms_eval/tasks/live_bench/live_bench_template_yaml b/lmms_eval/tasks/live_bench/live_bench_template_yaml
new file mode 100644
index 00000000..e3d9877e
--- /dev/null
+++ b/lmms_eval/tasks/live_bench/live_bench_template_yaml
@@ -0,0 +1,28 @@
+dataset_path: lmms-lab/LiveBench
+dataset_kwargs:
+  token: True
+test_split: test
+dataset_name: 2024-07
+output_type: generate_until
+doc_to_visual: !function utils.livebench_doc_to_visual
+doc_to_text: !function utils.livebench_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.livebench_process_results
+process_results_use_image: true
+metric_list:
+  - metric: gpt4_eval_score
+    aggregation: !function utils.livebench_aggregate_results
+    higher_is_better: true
+  # - metric: gpt4_eval_score_mini
+  #   aggregation: !function utils.livebench_aggregate_results
+  #   higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
diff --git a/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml b/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
index 2484f795..e18ee148 100644
--- a/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
+++ b/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
@@ -9,6 +9,7 @@ generation_kwargs:
   num_beams: 1
   do_sample: false
 process_results: !function utils.llava_process_results
+process_results_use_image: true
 metric_list:
   - metric: gpt_eval_llava_all
     aggregation: !function utils.llava_all_aggregation
diff --git a/lmms_eval/tasks/mathverse/mathverse_evals.py b/lmms_eval/tasks/mathverse/mathverse_evals.py
index 5894f6f7..fb72519b 100644
--- a/lmms_eval/tasks/mathverse/mathverse_evals.py
+++ b/lmms_eval/tasks/mathverse/mathverse_evals.py
@@ -265,7 +265,7 @@ def eval_results(self, results, config):
             problem = {
                 "question_type": inst["question_type"],
                 "answer": inst["answer"] if "answer" in inst else None,
-                "question_for_eval": inst["question_for_eval"],
+                "question_for_eval": inst["question"],
             }
             if config["metadata"].get("trunk_response", -1) > 0:
                 prediction = " ".join(full_prediction.split(" ")[-config["metadata"]["trunk_response"] :])
diff --git a/lmms_eval/tasks/mathverse/mathverse_testmini_vision.yaml b/lmms_eval/tasks/mathverse/mathverse_testmini_vision.yaml
new file mode 100644
index 00000000..261a23f4
--- /dev/null
+++ b/lmms_eval/tasks/mathverse/mathverse_testmini_vision.yaml
@@ -0,0 +1,10 @@
+group: mathverse_testmini_vision
+task:
+  - mathverse_testmini_vision_intensive
+  - mathverse_testmini_vision_dominant
+  - mathverse_testmini_vision_only
+metadata:
+  version: 0.0
+  gpt_eval_model_name: "gpt-3.5-turbo"
+  trunk_response: 30
+  quick_match: false
\ No newline at end of file
diff --git a/lmms_eval/tasks/mathverse/utils.py b/lmms_eval/tasks/mathverse/utils.py
index de22d93d..4ff81ff0 100644
--- a/lmms_eval/tasks/mathverse/utils.py
+++ b/lmms_eval/tasks/mathverse/utils.py
@@ -75,18 +75,19 @@ def mathverse_aggregate_results_submission(results, args, *, calculate_gain=Fals

 def mathverse_aggregate_results_eval(results, args, *, calculate_gain=False, random_scores=None):
     split_flag = results[0]["metadata"]["split"]
+    problem_version = results[0]["metadata"]["problem_version"].lower().replace(" ", "_")
     # save the result first, in case the gpt evaluation fails
-    path = generate_submission_file(f"mathverse_{split_flag}_results.json", args)
+    path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_results.json", args)
     with open(path, "w") as f:
         json.dump(results, f, indent=4)
     # gpt evaluation
     results_dict, scores = mathverse_evaluator.eval_results(results, config)
     # save results
-    path = generate_submission_file(f"mathverse_{split_flag}_results.json", args)
+    path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_results.json", args)
     with open(path, "w") as f:
         json.dump(results_dict, f, indent=4)
     # save scores
-    path = generate_submission_file(f"mathverse_{split_flag}_scores.json", args)
+    path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_scores.json", args)
     with open(path, "w") as f:
         json.dump(scores, f, indent=4)
     eval_logger.info(f"Saved scores to {path}")
diff --git a/lmms_eval/tasks/mmmu/_default_template_yaml b/lmms_eval/tasks/mmmu/_default_template_yaml
new file mode 100644
index 00000000..a5367534
--- /dev/null
+++ b/lmms_eval/tasks/mmmu/_default_template_yaml
@@ -0,0 +1,6 @@
+generation_kwargs:
+  max_new_tokens: 16
+
+metadata:
+  version: 0.0
+  interleaved_format: false
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmmu/mmmu_test.yaml b/lmms_eval/tasks/mmmu/mmmu_test.yaml
index 8f1a1f20..03564b6a 100755
--- a/lmms_eval/tasks/mmmu/mmmu_test.yaml
+++ b/lmms_eval/tasks/mmmu/mmmu_test.yaml
@@ -7,13 +7,10 @@ doc_to_text: !function utils.mmmu_doc_to_text
 doc_to_target: "answer"
 # The return value of process_results will be used by metrics
 process_results: !function utils.mmmu_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-generation_kwargs:
-  max_new_tokens: 16
-  image_aspect_ratio: original
+
 metric_list:
   - metric: submission
     aggregation: !function utils.mmmu_test_aggregate_results_for_submission
     higher_is_better: true
-metadata:
-  - version: 0.0
\ No newline at end of file
+
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmmu/mmmu_val.yaml b/lmms_eval/tasks/mmmu/mmmu_val.yaml
index 9e1574c4..a301f7cb 100755
--- a/lmms_eval/tasks/mmmu/mmmu_val.yaml
+++ b/lmms_eval/tasks/mmmu/mmmu_val.yaml
@@ -7,15 +7,10 @@ doc_to_text: !function utils.mmmu_doc_to_text
 doc_to_target: "answer"
 # The return value of process_results will be used by metrics
 process_results: !function utils.mmmu_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-generation_kwargs:
-  max_new_tokens: 128
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
+
 metric_list:
   - metric: mmmu_acc
     aggregation: !function utils.mmmu_aggregate_results
     higher_is_better: true
-metadata:
-  - version: 0.0
\ No newline at end of file
+
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py
index 83aeed20..ffa89c3f 100755
--- a/lmms_eval/tasks/mmmu/utils.py
+++ b/lmms_eval/tasks/mmmu/utils.py
@@ -5,7 +5,8 @@
 import numpy as np
 import os
 import json
-
+from pathlib import Path
+import yaml
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file


@@ -14,13 +15,23 @@
 MULTI_CHOICE_PROMPT = "Answer with the option's letter from the given choices directly."
 OPEN_ENDED_PROMPT = "Answer the question using a single word or phrase."

+with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+

 def replace_images_tokens(input_string):
-    # for i in range(1, 8):
-    #     question_text = f"<image {i}>"
-    #     query_text = "<image>"
-    #     if question_text in input_string:
-    #         input_string = input_string.replace(question_text, query_text)
+    for i in range(1, 8):
+        question_text = f"<image {i}>"
+        query_text = "<image>"
+        if question_text in input_string:
+            input_string = input_string.replace(question_text, query_text)
     return input_string


@@ -44,7 +55,9 @@ def construct_prompt(doc):

 def mmmu_doc_to_text(doc):
     question = construct_prompt(doc)
-    return replace_images_tokens(question)
+    if config["metadata"]["interleaved_format"]:
+        question = replace_images_tokens(question)
+    return question


 def mmmu_doc_to_visual(doc):
diff --git a/lmms_eval/tasks/nextqa/_default_template_yaml b/lmms_eval/tasks/nextqa/_default_template_yaml
index 65f3845a..e30dd25f 100644
--- a/lmms_eval/tasks/nextqa/_default_template_yaml
+++ b/lmms_eval/tasks/nextqa/_default_template_yaml
@@ -3,3 +3,6 @@ dataset_kwargs:
   token: True
   video: True
   cache_dir: nextqa
+metadata:
+  version: 0.0.1
+  load_package: False
\ No newline at end of file
diff --git a/lmms_eval/tasks/nextqa/utils.py b/lmms_eval/tasks/nextqa/utils.py
index 4fa46523..6556723b 100644
--- a/lmms_eval/tasks/nextqa/utils.py
+++ b/lmms_eval/tasks/nextqa/utils.py
@@ -1,40 +1,15 @@
 import os
 import yaml
-
 import random
 import pandas as pd
-
 from pathlib import Path
-
 from loguru import logger as eval_logger

-try:
-    from pywsd.utils import lemmatize_sentence
-except ImportError:
-    eval_logger.debug("pywsd not installed. Please install pywsd to use this module. You can install it by running 'pip install pywsd'")
-
-try:
-    from nltk.tokenize import word_tokenize
-    from nltk.corpus import wordnet
-
-    try:
-        import nltk
-
-        nltk.download("averaged_perceptron_tagger", quiet=True)
-        nltk.download("wordnet", quiet=True)
-        nltk.download("punkt", quiet=True)
-    except Exception as e:
-        eval_logger.debug(f"nltk download failed: {e}")
-except ImportError:
-    eval_logger.debug("nltk not installed. Please install nltk to use this module. You can install it by running 'pip install nltk'")
-
 from lmms_eval.tasks._task_utils.video_loader import get_cache_dir, get_video
 import numpy as np

-
 OPTIONS = ["A", "B", "C", "D", "E"]

-
 with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
     raw_data = f.readlines()
     safe_data = []
@@ -45,6 +20,23 @@

 config = yaml.safe_load("".join(safe_data))

+if config["metadata"]["load_package"]:
+    try:
+        from pywsd.utils import lemmatize_sentence
+    except ImportError:
+        eval_logger.debug("pywsd not installed. Please install pywsd to use this module. You can install it by running 'pip install pywsd'")
+
+    try:
+        from nltk.tokenize import word_tokenize
+        from nltk.corpus import wordnet
+        import nltk
+
+        nltk.download("averaged_perceptron_tagger", quiet=True)
+        nltk.download("wordnet", quiet=True)
+        nltk.download("punkt", quiet=True)
+    except ImportError:
+        eval_logger.debug("nltk not installed. Please install nltk to use this module. You can install it by running 'pip install nltk'")
+
 stopwords = set(pd.read_csv(Path(__file__).parent / "stopwords.csv").squeeze())

 cache_dir = get_cache_dir(config, "NExTVideo")
diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
index 936878bf..f5562bf3 100755
--- a/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
+++ b/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
@@ -11,11 +11,9 @@ metric_list:
     aggregation: !function utils.videochatgpt_aggregate_consistency
     higher_is_better: true
 include: _default_template_yaml
+full_docs: true

 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  image_aspect_ratio: original
   max_new_tokens: 1024
   temperature: 0
   top_p: 1.0
diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml
index 9affe534..4214eefd 100755
--- a/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml
+++ b/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml
@@ -17,3 +17,10 @@ metric_list:
     aggregation: !function utils.videochatgpt_aggregate_score
     higher_is_better: true
 include: _default_template_yaml
+
+generation_kwargs:
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
index 1e207c2a..54bb57a9 100755
--- a/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
+++ b/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
@@ -13,9 +13,6 @@ metric_list:
     higher_is_better: true
 include: _default_template_yaml
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  image_aspect_ratio: original
   max_new_tokens: 1024
   temperature: 0
   top_p: 1.0
diff --git a/lmms_eval/tasks/wild_vision_bench/_default_template_yaml b/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
index 7ce709dc..33d18d39 100644
--- a/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
+++ b/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
@@ -5,6 +5,7 @@ output_type: generate_until
 doc_to_visual: !function utils.wild_vision_doc_to_visual
 doc_to_text: !function utils.wild_vision_doc_to_text
 doc_to_target: !function utils.wild_vision_doc_to_target
+process_results_use_image: true
 generation_kwargs:
   max_new_tokens: 4096
   temperature: 0