diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index 5560745c..f488bb22 100755
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -67,8 +67,10 @@ class TaskConfig(dict):
     validation_split: str = None
     test_split: str = None
     fewshot_split: str = None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+    full_docs: bool = False
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
+    process_results_use_image: bool = False
     process_docs: Callable = None
     doc_to_visual: Union[Callable, str] = None
     doc_to_text: Union[Callable, str] = None
diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
index 48397a0a..0e890533 100755
--- a/lmms_eval/evaluator.py
+++ b/lmms_eval/evaluator.py
@@ -327,7 +327,7 @@ def evaluate(
         # hack: remove image columns to speed avoid loading images and speed up postprocessing
         # reason: doc_iterator will actually load image if it's in the doc.
         docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
-        if "d170" not in task_name and "dc100" not in task_name and "dc200" not in task_name and "llava_wilder" not in task_name and "live_bench" not in task_name and "wildvision" not in task_name:
+        if not task.config["process_results_use_image"]:
             remove_cols = []
             features = docs.features
             # If it is an Image instance or a Sequence of Image instance. Remove it
@@ -340,10 +340,7 @@
             docs = docs.remove_columns(remove_cols)

         ####################### Processing with Full Docs Mode #######################
-        if task_name in ["videochatgpt_consistency"]:
-            full_docs = True
-        else:
-            full_docs = False
+        full_docs = task.config["full_docs"]

         doc_iterator = itertools.islice(enumerate(docs), lm.rank, limit, lm.world_size)
         # Instead of converting the iterator to a list, use `itertools.tee` to create a parallel iterator for counting
diff --git a/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml b/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml
index cd692a1d..5281ad1d 100755
--- a/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml
+++ b/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml
@@ -1,4 +1,5 @@
 model_specific_prompt_kwargs:
   default:
     pre_prompt: ""
-    post_prompt: ""
\ No newline at end of file
+    post_prompt: ""
+process_results_use_image: true
diff --git a/lmms_eval/tasks/live_bench/live_bench.yaml b/lmms_eval/tasks/live_bench/live_bench.yaml
old mode 100644
new mode 100755
index b4a09853..1253105e
--- a/lmms_eval/tasks/live_bench/live_bench.yaml
+++ b/lmms_eval/tasks/live_bench/live_bench.yaml
@@ -1,31 +1,8 @@
-dataset_path: lmms-lab/LiveBench
-dataset_kwargs:
-  token: True
-task: "live_bench"
-test_split: test
-dataset_name: 2024-07
-output_type: generate_until
-doc_to_visual: !function utils.livebench_doc_to_visual
-doc_to_text: !function utils.livebench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 1024
-  temperature: 0
-  top_p: 1.0
-  num_beams: 1
-  do_sample: false
-process_results: !function utils.livebench_process_results
-metric_list:
-  - metric: gpt4_eval_score
-    aggregation: !function utils.livebench_aggregate_results
-    higher_is_better: true
-  # - metric: gpt4_eval_score_mini
-  #   aggregation: !function utils.livebench_aggregate_results
-  #   higher_is_better: true
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: ""
+group: live_bench
+task:
+- live_bench_2406
+- live_bench_2407
+
 metadata:
   api_type : openai
   eval_with_mini: false
diff --git a/lmms_eval/tasks/live_bench/live_bench_2406.yaml b/lmms_eval/tasks/live_bench/live_bench_2406.yaml
new file mode 100644
index 00000000..c82eba4c
--- /dev/null
+++ b/lmms_eval/tasks/live_bench/live_bench_2406.yaml
@@ -0,0 +1,3 @@
+task: "live_bench_2406"
+dataset_name: 2024-06
+include: live_bench_template_yaml
diff --git a/lmms_eval/tasks/live_bench/live_bench_2407.yaml b/lmms_eval/tasks/live_bench/live_bench_2407.yaml
new file mode 100644
index 00000000..d7791345
--- /dev/null
+++ b/lmms_eval/tasks/live_bench/live_bench_2407.yaml
@@ -0,0 +1,3 @@
+task: "live_bench_2407"
+dataset_name: 2024-07
+include: live_bench_template_yaml
diff --git a/lmms_eval/tasks/live_bench/live_bench_template_yaml b/lmms_eval/tasks/live_bench/live_bench_template_yaml
new file mode 100644
index 00000000..e3d9877e
--- /dev/null
+++ b/lmms_eval/tasks/live_bench/live_bench_template_yaml
@@ -0,0 +1,28 @@
+dataset_path: lmms-lab/LiveBench
+dataset_kwargs:
+  token: True
+test_split: test
+dataset_name: 2024-07
+output_type: generate_until
+doc_to_visual: !function utils.livebench_doc_to_visual
+doc_to_text: !function utils.livebench_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.livebench_process_results
+process_results_use_image: true
+metric_list:
+  - metric: gpt4_eval_score
+    aggregation: !function utils.livebench_aggregate_results
+    higher_is_better: true
+  # - metric: gpt4_eval_score_mini
+  #   aggregation: !function utils.livebench_aggregate_results
+  #   higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
diff --git a/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml b/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
index 2484f795..e18ee148 100644
--- a/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
+++ b/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
@@ -9,6 +9,7 @@ generation_kwargs:
   num_beams: 1
   do_sample: false
 process_results: !function utils.llava_process_results
+process_results_use_image: true
 metric_list:
   - metric: gpt_eval_llava_all
     aggregation: !function utils.llava_all_aggregation
diff --git a/lmms_eval/tasks/mathverse/mathverse_evals.py b/lmms_eval/tasks/mathverse/mathverse_evals.py
index 5894f6f7..fb72519b 100644
--- a/lmms_eval/tasks/mathverse/mathverse_evals.py
+++ b/lmms_eval/tasks/mathverse/mathverse_evals.py
@@ -265,7 +265,7 @@ def eval_results(self, results, config):
             problem = {
                 "question_type": inst["question_type"],
                 "answer": inst["answer"] if "answer" in inst else None,
-                "question_for_eval": inst["question_for_eval"],
+                "question_for_eval": inst["question"],
             }
             if config["metadata"].get("trunk_response", -1) > 0:
                 prediction = " ".join(full_prediction.split(" ")[-config["metadata"]["trunk_response"] :])
diff --git a/lmms_eval/tasks/mathverse/mathverse_testmini_vision.yaml b/lmms_eval/tasks/mathverse/mathverse_testmini_vision.yaml
new file mode 100644
index 00000000..261a23f4
--- /dev/null
+++ b/lmms_eval/tasks/mathverse/mathverse_testmini_vision.yaml
@@ -0,0 +1,10 @@
+group: mathverse_testmini_vision
+task:
+  - mathverse_testmini_vision_intensive
+  - mathverse_testmini_vision_dominant
+  - mathverse_testmini_vision_only
+metadata:
+  version: 0.0
+  gpt_eval_model_name: "gpt-3.5-turbo"
+  trunk_response: 30
+  quick_match: false
\ No newline at end of file
diff --git a/lmms_eval/tasks/mathverse/utils.py b/lmms_eval/tasks/mathverse/utils.py
index de22d93d..4ff81ff0 100644
--- a/lmms_eval/tasks/mathverse/utils.py
+++ b/lmms_eval/tasks/mathverse/utils.py
@@ -75,18 +75,19 @@ def mathverse_aggregate_results_submission(results, args, *, calculate_gain=Fals

 def mathverse_aggregate_results_eval(results, args, *, calculate_gain=False, random_scores=None):
     split_flag = results[0]["metadata"]["split"]
+    problem_version = results[0]["metadata"]["problem_version"].lower().replace(" ", "_")
     # save the result first, in case the gpt evaluation fails
-    path = generate_submission_file(f"mathverse_{split_flag}_results.json", args)
+    path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_results.json", args)
     with open(path, "w") as f:
         json.dump(results, f, indent=4)
     # gpt evaluation
     results_dict, scores = mathverse_evaluator.eval_results(results, config)
     # save results
-    path = generate_submission_file(f"mathverse_{split_flag}_results.json", args)
+    path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_results.json", args)
     with open(path, "w") as f:
         json.dump(results_dict, f, indent=4)
     # save scores
-    path = generate_submission_file(f"mathverse_{split_flag}_scores.json", args)
+    path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_scores.json", args)
     with open(path, "w") as f:
         json.dump(scores, f, indent=4)
     eval_logger.info(f"Saved scores to {path}")
diff --git a/lmms_eval/tasks/mmmu/_default_template_yaml b/lmms_eval/tasks/mmmu/_default_template_yaml
new file mode 100644
index 00000000..a5367534
--- /dev/null
+++ b/lmms_eval/tasks/mmmu/_default_template_yaml
@@ -0,0 +1,6 @@
+generation_kwargs:
+  max_new_tokens: 16
+
+metadata:
+  version: 0.0
+  interleaved_format: false
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmmu/mmmu_test.yaml b/lmms_eval/tasks/mmmu/mmmu_test.yaml
index 8f1a1f20..03564b6a 100755
--- a/lmms_eval/tasks/mmmu/mmmu_test.yaml
+++ b/lmms_eval/tasks/mmmu/mmmu_test.yaml
@@ -7,13 +7,10 @@ doc_to_text: !function utils.mmmu_doc_to_text
 doc_to_target: "answer"
 # The return value of process_results will be used by metrics
 process_results: !function utils.mmmu_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-generation_kwargs:
-  max_new_tokens: 16
-  image_aspect_ratio: original
+
 metric_list:
   - metric: submission
     aggregation: !function utils.mmmu_test_aggregate_results_for_submission
     higher_is_better: true
-metadata:
-  - version: 0.0
\ No newline at end of file
+
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmmu/mmmu_val.yaml b/lmms_eval/tasks/mmmu/mmmu_val.yaml
index 9e1574c4..a301f7cb 100755
--- a/lmms_eval/tasks/mmmu/mmmu_val.yaml
+++ b/lmms_eval/tasks/mmmu/mmmu_val.yaml
@@ -7,15 +7,10 @@ doc_to_text: !function utils.mmmu_doc_to_text
 doc_to_target: "answer"
 # The return value of process_results will be used by metrics
 process_results: !function utils.mmmu_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-generation_kwargs:
-  max_new_tokens: 128
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
+
 metric_list:
   - metric: mmmu_acc
     aggregation: !function utils.mmmu_aggregate_results
     higher_is_better: true
-metadata:
-  - version: 0.0
\ No newline at end of file
+
+include: _default_template_yaml
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py
index 83aeed20..ffa89c3f 100755
--- a/lmms_eval/tasks/mmmu/utils.py
+++ b/lmms_eval/tasks/mmmu/utils.py
@@ -5,7 +5,8 @@
 import numpy as np
 import os
 import json
-
+from pathlib import Path
+import yaml
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file


@@ -14,13 +15,23 @@
 MULTI_CHOICE_PROMPT = "Answer with the option's letter from the given choices directly."
 OPEN_ENDED_PROMPT = "Answer the question using a single word or phrase."

+with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+

 def replace_images_tokens(input_string):
-    # for i in range(1, 8):
-    #     question_text = f"<image {i}>"
-    #     query_text = "<image>"
-    #     if question_text in input_string:
-    #         input_string = input_string.replace(question_text, query_text)
+    for i in range(1, 8):
+        question_text = f"<image {i}>"
+        query_text = "<image>"
+        if question_text in input_string:
+            input_string = input_string.replace(question_text, query_text)
     return input_string


@@ -44,7 +55,9 @@ def construct_prompt(doc):

 def mmmu_doc_to_text(doc):
     question = construct_prompt(doc)
-    return replace_images_tokens(question)
+    if config["metadata"]["interleaved_format"]:
+        question = replace_images_tokens(question)
+    return question


 def mmmu_doc_to_visual(doc):
diff --git a/lmms_eval/tasks/nextqa/_default_template_yaml b/lmms_eval/tasks/nextqa/_default_template_yaml
index 65f3845a..e30dd25f 100644
--- a/lmms_eval/tasks/nextqa/_default_template_yaml
+++ b/lmms_eval/tasks/nextqa/_default_template_yaml
@@ -3,3 +3,6 @@ dataset_kwargs:
   token: True
   video: True
   cache_dir: nextqa
+metadata:
+  version: 0.0.1
+  load_package: False
\ No newline at end of file
diff --git a/lmms_eval/tasks/nextqa/utils.py b/lmms_eval/tasks/nextqa/utils.py
index 4fa46523..6556723b 100644
--- a/lmms_eval/tasks/nextqa/utils.py
+++ b/lmms_eval/tasks/nextqa/utils.py
@@ -1,40 +1,15 @@
 import os
 import yaml
-
 import random
 import pandas as pd
-
 from pathlib import Path
-
 from loguru import logger as eval_logger

-try:
-    from pywsd.utils import lemmatize_sentence
-except ImportError:
-    eval_logger.debug("pywsd not installed. Please install pywsd to use this module. You can install it by running 'pip install pywsd'")
-
-try:
-    from nltk.tokenize import word_tokenize
-    from nltk.corpus import wordnet
-
-    try:
-        import nltk
-
-        nltk.download("averaged_perceptron_tagger", quiet=True)
-        nltk.download("wordnet", quiet=True)
-        nltk.download("punkt", quiet=True)
-    except Exception as e:
-        eval_logger.debug(f"nltk download failed: {e}")
-except ImportError:
-    eval_logger.debug("nltk not installed. Please install nltk to use this module. You can install it by running 'pip install nltk'")
-
 from lmms_eval.tasks._task_utils.video_loader import get_cache_dir, get_video
 import numpy as np

-
 OPTIONS = ["A", "B", "C", "D", "E"]

-
 with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
     raw_data = f.readlines()
     safe_data = []
@@ -45,6 +20,23 @@

 config = yaml.safe_load("".join(safe_data))

+if config["metadata"]["load_package"]:
+    try:
+        from pywsd.utils import lemmatize_sentence
+    except ImportError:
+        eval_logger.debug("pywsd not installed. Please install pywsd to use this module. You can install it by running 'pip install pywsd'")
+
+    try:
+        from nltk.tokenize import word_tokenize
+        from nltk.corpus import wordnet
+        import nltk
+
+        nltk.download("averaged_perceptron_tagger", quiet=True)
+        nltk.download("wordnet", quiet=True)
+        nltk.download("punkt", quiet=True)
+    except ImportError:
+        eval_logger.debug("nltk not installed. Please install nltk to use this module. You can install it by running 'pip install nltk'")
+
 stopwords = set(pd.read_csv(Path(__file__).parent / "stopwords.csv").squeeze())

 cache_dir = get_cache_dir(config, "NExTVideo")
diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
index 936878bf..f5562bf3 100755
--- a/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
+++ b/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
@@ -11,11 +11,9 @@ metric_list:
     aggregation: !function utils.videochatgpt_aggregate_consistency
     higher_is_better: true
 include: _default_template_yaml
+full_docs: true

 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  image_aspect_ratio: original
   max_new_tokens: 1024
   temperature: 0
   top_p: 1.0
diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml
index 9affe534..4214eefd 100755
--- a/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml
+++ b/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml
@@ -17,3 +17,10 @@ metric_list:
     aggregation: !function utils.videochatgpt_aggregate_score
     higher_is_better: true
 include: _default_template_yaml
+
+generation_kwargs:
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
index 1e207c2a..54bb57a9 100755
--- a/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
+++ b/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
@@ -13,9 +13,6 @@ metric_list:
     higher_is_better: true
 include: _default_template_yaml
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  image_aspect_ratio: original
   max_new_tokens: 1024
   temperature: 0
   top_p: 1.0
diff --git a/lmms_eval/tasks/wild_vision_bench/_default_template_yaml b/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
index 7ce709dc..33d18d39 100644
--- a/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
+++ b/lmms_eval/tasks/wild_vision_bench/_default_template_yaml
@@ -5,6 +5,7 @@ output_type: generate_until
 doc_to_visual: !function utils.wild_vision_doc_to_visual
 doc_to_text: !function utils.wild_vision_doc_to_text
 doc_to_target: !function utils.wild_vision_doc_to_target
+process_results_use_image: true
 generation_kwargs:
   max_new_tokens: 4096
   temperature: 0