diff --git a/README.md b/README.md
index 17d5e31e..d31451eb 100755
--- a/README.md
+++ b/README.md
@@ -17,10 +17,11 @@
 ---
 
 ## Annoucement
+- [2024-09] 🎉🎉 We welcome the new task [MME-RealWorld](https://mme-realworld.github.io/).
 - [2024-09] ⚙⚙ We upgrade `lmms-eval` to `0.2.3` with more tasks and features. We support a compact set of language tasks evaluations (code credit to [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)), and we remove the registration logic at start (for all models and tasks) to reduce the overhead. Now `lmms-eval` only launches necessary tasks/models. Please check the [release notes](https://github.com/EvolvingLMMs-Lab/lmms-eval/releases/tag/v0.2.3) for more details.
 - [2024-08] 🎉🎉 We welcome the new model [LLaVA-OneVision](https://huggingface.co/papers/2408.03326), [Mantis](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/162), new tasks [MVBench](https://huggingface.co/datasets/OpenGVLab/MVBench), [LongVideoBench](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/117), [MMStar](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/158). We provide new feature of SGlang Runtime API for llava-onevision model, please refer the [doc](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/docs/commands.md) for inference acceleration
 - [2024-07] 🎉🎉 We have released the [technical report](https://arxiv.org/abs/2407.12772) and [LiveBench](https://huggingface.co/spaces/lmms-lab/LiveBench)!
-- [2024-07] 👚‍💻👚‍💻 The `lmms-eval/v0.2.1` has been upgraded to support more models, including [LongVA](https://github.com/EvolvingLMMs-Lab/LongVA), [InterVL-2](https://github.com/OpenGVLab/InternVL), [VILA](https://github.com/NVlabs/VILA), and many more evaluation tasks, e.g. [Details Captions](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/136), [MLVU](https://arxiv.org/abs/2406.04264), [WildVision-Bench](https://huggingface.co/datasets/WildVision/wildvision-arena-data), [VITATECS](https://github.com/lscpku/VITATECS) and [LLaVA-Interleave-Bench](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/).
+- [2024-07] 👚‍💻👚‍💻 The `lmms-eval/v0.2.1` has been upgraded to support more models, including [LongVA](https://github.com/EvolvingLMMs-Lab/LongVA), [InternVL-2](https://github.com/OpenGVLab/InternVL), [VILA](https://github.com/NVlabs/VILA), and many more evaluation tasks, e.g. [Details Captions](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/136), [MLVU](https://arxiv.org/abs/2406.04264), [WildVision-Bench](https://huggingface.co/datasets/WildVision/wildvision-arena-data), [VITATECS](https://github.com/lscpku/VITATECS) and [LLaVA-Interleave-Bench](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/).
 - [2024-06] 🎬🎬 The `lmms-eval/v0.2.0` has been upgraded to support video evaluations for video models like LLaVA-NeXT Video and Gemini 1.5 Pro across tasks such as EgoSchema, PerceptionTest, VideoMME, and more.
 
 Please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.2/) for more details
diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index 98258c16..085adba0 100755
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -1023,6 +1023,7 @@ def concat_tar_parts(tar_parts, output_tar):
            download_config=download_config,
            **dataset_kwargs if dataset_kwargs is not None else {},
        )
+
        if self.config.process_docs is not None:
            for split in self.dataset:
                if split in [self.config.training_split, self.config.validation_split, self.config.test_split, self.config.fewshot_split]:
diff --git a/lmms_eval/tasks/mme_realworld/mme_realworld.yaml b/lmms_eval/tasks/mme_realworld/mme_realworld.yaml
new file mode 100644
index 00000000..50f2d3c0
--- /dev/null
+++ b/lmms_eval/tasks/mme_realworld/mme_realworld.yaml
@@ -0,0 +1,37 @@
+dataset_path: yifanzhang114/MME-RealWorld-Lmms-eval
+dataset_kwargs:
+  token: True
+  cache_dir: mmerealworld
+  video: True
+  # From_YouTube: True
+task: mmerealworld
+test_split: train
+output_type: generate_until
+doc_to_visual: !function utils.mme_realworld_doc_to_visual
+doc_to_text: !function utils.mme_realworld_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.mme_realworld_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: mme_realworld_score
+    aggregation: !function utils.mme_realworld_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nSelect the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option."
+  gpt4v:
+    pre_prompt: ""
+    post_prompt: "\nSelect the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option."
+  xcomposer2_4khd:
+    pre_prompt: "[UNUSED_TOKEN_146]user\n"
+    post_prompt: " Answer this question with A, B, C, D, or E.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
+metadata:
+  - version: 0.0
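Not part of the patch, for orientation only: the text that actually reaches the model is assembled by `utils.mme_realworld_doc_to_text` (added later in this diff), and the `post_prompt` strings above repeat the same instruction per model type. A minimal sketch with a fabricated doc:

```python
# Illustrative only: a made-up doc in the shape this task expects.
# The construction below mirrors utils.mme_realworld_doc_to_text from this patch.
doc = {
    "question": "What color is the traffic light on the left?",
    "multi-choice options": ["(A) Red", "(B) Green", "(C) Yellow", "(D) Off", "(E) This image doesn't feature the object."],
}

option_prompt = "The choices are listed below:\n" + "\n".join(doc["multi-choice options"]) + "\n"
prompt = (
    doc["question"]
    + " "
    + option_prompt
    + "Select the best answer to the above multiple-choice question based on the image. "
    + "Respond with only the letter (A, B, C, D, or E) of the correct option.\nThe best answer is: "
)
print(prompt)
```

With `max_new_tokens: 16` and greedy decoding (`do_sample: false`, `num_beams: 1`), the model is expected to return little beyond the option letter, which is what the answer extraction in `utils.py` relies on.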
diff --git a/lmms_eval/tasks/mme_realworld/mme_realworld_cn.yaml b/lmms_eval/tasks/mme_realworld/mme_realworld_cn.yaml
new file mode 100644
index 00000000..e4dcda62
--- /dev/null
+++ b/lmms_eval/tasks/mme_realworld/mme_realworld_cn.yaml
@@ -0,0 +1,37 @@
+dataset_path: yifanzhang114/MME-RealWorld-CN-Lmms-eval
+dataset_kwargs:
+  token: True
+  cache_dir: mmerealworld
+  video: True
+  # From_YouTube: True
+task: mmerealworld_cn
+test_split: train
+output_type: generate_until
+doc_to_visual: !function utils.mme_realworld_doc_to_visual
+doc_to_text: !function utils.mme_realworld_cn_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.mme_realworld_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: mme_realworld_score
+    aggregation: !function utils.mme_realworld_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\n根据囟像选择䞊述倚项选择题的最䜳答案。只需回答正确选项的字母A, B, C, D 或 E。"
+  gpt4v:
+    pre_prompt: ""
+    post_prompt: "\n根据囟像选择䞊述倚项选择题的最䜳答案。只需回答正确选项的字母A, B, C, D 或 E。"
+  xcomposer2_4khd:
+    pre_prompt: "[UNUSED_TOKEN_146]user\n"
+    post_prompt: " 根据囟像选择䞊述倚项选择题的最䜳答案。只需回答正确选项的字母A, B, C, D 或 E。[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/mme_realworld/utils.py b/lmms_eval/tasks/mme_realworld/utils.py
new file mode 100644
index 00000000..39e96105
--- /dev/null
+++ b/lmms_eval/tasks/mme_realworld/utils.py
@@ -0,0 +1,186 @@
+import datetime
+import json
+import os
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import cv2
+import numpy as np
+import yaml
+from loguru import logger as eval_logger
+
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+
+TASKS = [
+    "Reasoning",
+    "Perception",
+]
+
+SUBTASKS = [
+    "Monitoring",
+    "Autonomous_Driving",
+    "OCR with Complex Context",
+    "Diagram and Table",
+    "Remote Sensing",
+]
+
+
+def mme_realworld_doc_to_visual(doc):
+    img = decode_base64_to_image(doc["bytes"])
+    return [img.convert("RGB")]
+
+
+import base64
+import io
+
+from PIL import Image
+
+
+def decode_base64_to_image(base64_string, target_size=-1):
+    image_data = base64.b64decode(base64_string)
+    image = Image.open(io.BytesIO(image_data))
+    if image.mode in ("RGBA", "P"):
+        image = image.convert("RGB")
+    if target_size > 0:
+        image.thumbnail((target_size, target_size))
+    return image
+
+
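A quick usage sketch of `decode_base64_to_image` (not part of the patch; the image and sizes are made up): dataset rows carry a base64-encoded image in `doc["bytes"]`, and `target_size` only ever downscales because `PIL.Image.thumbnail` never enlarges.

```python
import base64
import io

from PIL import Image

# Stand-in image purely for illustration; real docs provide the base64 string in doc["bytes"].
img = Image.new("RGB", (640, 480), color="gray")
buf = io.BytesIO()
img.save(buf, format="JPEG")
doc = {"bytes": base64.b64encode(buf.getvalue()).decode("utf-8")}

restored = decode_base64_to_image(doc["bytes"], target_size=512)
print(restored.size)  # (512, 384): aspect ratio kept, longer side capped at 512
```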
+def mme_realworld_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    question = doc["question"]
+    option_prompt = "The choices are listed below:\n" + "\n".join(doc["multi-choice options"]) + "\n"
+
+    question += " " + option_prompt + "Select the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option.\nThe best answer is: "
+    return question
+
+
+def mme_realworld_cn_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    question = doc["question"]
+    option_prompt = "选项劂䞋所瀺:\n" + "\n".join(doc["multi-choice options"]) + "\n"
+
+    question += " " + option_prompt + "根据囟像选择䞊述倚项选择题的最䜳答案。只需回答正确选项的字母A, B, C, D 或 E。\n最䜳答案䞺 "
+    return question
+
+
+# [Image] [Question] The choices are listed below:
+# (A) [Choice A]
+# (B) [Choice B]
+# (C) [Choice C]
+# (D) [Choice D]
+# (E) [Choice E]
+# Select the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option.
+# The best answer is:
+
+
+def extract_characters_regex(s, choices=["(A)", "(B)", "(C)", "(D)", "(E)"]):
+    if type(s) is dict:
+        s = ""
+    s = s.strip()
+    answer_prefixes = [
+        "The best answer is",
+        "The correct answer is",
+        "The answer is",
+        "The answer",
+        "The best option is", "The correct option is",
+        "Best answer:", "Best option:",
+    ]
+    for answer_prefix in answer_prefixes:
+        s = s.replace(answer_prefix, "")
+
+    if len(s.split()) > 10 and not re.search("[ABCDE]", s):
+        return ""
+    matches = re.search(r"[ABCDE]", s)
+    if matches is None:
+        for choice in choices:
+            if s.lower() in choice.lower():
+                return choice[1]
+        return ""
+    return matches[0]
+
+
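For reference (fabricated strings, not dataset outputs), the extraction above strips a leading answer phrase, takes the first capital A–E it finds, and returns an empty string for long free-form text that names no option:

```python
# Illustrative sanity checks for extract_characters_regex; the inputs are made up.
print(extract_characters_regex("The best answer is (B)."))  # -> "B"
print(extract_characters_regex("(E) The image does not feature the object."))  # -> "E"
print(extract_characters_regex("the image is too small and too blurry to tell what the right option would be"))  # -> ""
```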
+def mme_realworld_process_results(doc, results):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name (in this case mme_realworld score), value: metric value
+    """
+    pred = results[0]
+    pred_ans = extract_characters_regex(pred)
+    # gt_ans = doc["answer"].lower().strip().replace(".", "")
+
+    category = "Perception" if "perception" in doc["category"].lower() else "Reasoning"
+    sub_category = doc["category"].split("/")[-1]
+    task_category = doc["l2-category"]
+    data_dict = {"question_id": doc["index"], "category": category, "sub_category": sub_category, "task_category": task_category, "pred_answer": pred_ans, "answer": doc["answer"]}
+
+    # return {f"mme_realworld_percetion_score": data_dict for metric in matrices}
+    return {f"mme_realworld_score": data_dict}
+
+
+def mme_realworld_aggregate_results(results):
+    """
+    Args:
+        results: a list of values returned by process_results
+    Returns:
+        A score
+    """
+
+    metrics = {}
+    for task in TASKS:
+        metrics[f"{task}"] = {}
+        for subtask in SUBTASKS:
+            metrics[f"{task}"][f"{subtask}"] = {}
+
+    for i in range(len(results)):
+        result = results[i]
+        Task = result["category"]
+        Subtask = result["sub_category"]
+        Category = result["task_category"].lower()
+        if "attribute" in Category.lower():
+            Category = Category.split("/")[0] + "/attribute"
+        cnt = result["pred_answer"].lower() == result["answer"].lower()
+        if Category not in metrics[Task][Subtask].keys():
+            metrics[Task][Subtask][f"{Category}"] = {"true": cnt, "false": 1 - cnt, "is_E": result["pred_answer"] == "E"}
+        else:
+            metrics[Task][Subtask][f"{Category}"]["true"] += cnt
+            metrics[Task][Subtask][f"{Category}"]["false"] += 1 - cnt
+            metrics[Task][Subtask][f"{Category}"]["is_E"] += result["pred_answer"] == "E"
+
+    sum_all, succ_all = 0, 0
+    for task, tasks_values in metrics.items():
+        eval_logger.info(f"*" * 32 + f"{task} (Task Start)")
+        cnt_task, cnt_E, sum_task = 0, 0, 0
+        for subtask, subtask_value in tasks_values.items():
+            eval_logger.info(f"+" * 16 + f"{subtask} (Subtask Start)")
+            cnt_subtask, sum_subtask, e_subtask = 0, 0, 0
+            for category, category_dict in subtask_value.items():
+                cnt_subtask += category_dict["true"]
+                sum_subtask += category_dict["false"] + category_dict["true"]
+                e_subtask += category_dict["is_E"]
+                acc = category_dict["true"] / (category_dict["false"] + category_dict["true"])
+                eval_logger.info(f"-" * 4 + f"\t" + "Acc " + "{:.4f}".format(acc) + f"\t{category.capitalize()} ({category_dict['false'] + category_dict['true']} items)")
+
+            if sum_subtask == 0:
+                acc_subtasks = 0
+                e_subtask = 0
+            else:
+                acc_subtasks = cnt_subtask / sum_subtask
+            eval_logger.info(f"+" * 16 + f"\t Acc " + "{:.4f}".format(acc_subtasks) + f"\t E choice {e_subtask} \t{subtask} ({sum_subtask} items)")
+            cnt_task += cnt_subtask
+            sum_task += sum_subtask
+            cnt_E += e_subtask
+
+        if sum_task == 0:
+            acc_task = 0
+        else:
+            acc_task = cnt_task / sum_task
+        succ_all += cnt_task
+        sum_all += sum_task
+        eval_logger.info(f"*" * 32 + f"Acc " + "{:.4f}".format(acc_task) + f"\t E choice {cnt_E} \t{task} ({sum_task} items)\n")
+    eval_logger.info(f"*" * 32 + f"Overall Acc " + "{:.4f}".format(succ_all / sum_all))
+    return succ_all / sum_all
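To see what the aggregation returns end to end, here is a tiny fabricated input in the shape produced by `mme_realworld_process_results`; the `task_category` strings are placeholders rather than the benchmark's real taxonomy. The call logs per-category, per-subtask, and per-task accuracies and returns the overall accuracy, 0.5 for this toy list:

```python
# Fabricated records for illustration only; they mimic the dict structure emitted by
# mme_realworld_process_results but do not use real MME-RealWorld categories.
results = [
    {"question_id": 0, "category": "Perception", "sub_category": "Monitoring",
     "task_category": "perception/counting", "pred_answer": "A", "answer": "A"},
    {"question_id": 1, "category": "Reasoning", "sub_category": "Autonomous_Driving",
     "task_category": "reasoning/intention", "pred_answer": "E", "answer": "B"},
]
print(mme_realworld_aggregate_results(results))  # 0.5 (1 of 2 correct)
```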