From 8e21e95b8c7fd3d368335d74b2fa56b89872be26 Mon Sep 17 00:00:00 2001 From: evergreen-bupt <609531932@qq.com> Date: Mon, 7 Oct 2024 02:24:39 +0800 Subject: [PATCH] update 10.7 pure --- .gitignore | 5 + lmms_eval/models/__init__.py | 9 +- lmms_eval/models/cambrian_8b.py | 438 ------------------ .../models/{cambrian.py => cambrian_model.py} | 42 +- lmms_eval/models/emu2.py | 31 -- lmms_eval/models/gemma.py | 2 +- lmms_eval/models/gpt4v_01.py | 202 ++++++++ lmms_eval/models/gpt4v_01_batch.py | 224 +++++++++ lmms_eval/models/idefics2.py | 8 - lmms_eval/models/internvl2.py | 11 - lmms_eval/models/minicpm_v.py | 4 - lmms_eval/models/paligemma.py | 66 --- lmms_eval/models/qwen.py | 2 +- lmms_eval/models/test_swwu.py | 78 ---- lmms_eval/models/yi.py | 2 +- lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml | 4 - lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml | 4 - lmms_eval/tasks/chartqa_all_vd/chartqa.yaml | 4 - lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml | 4 - lmms_eval/tasks/mmmu/mmmu_aug.yaml | 2 +- lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml | 2 +- .../tasks/ocrbench_all_vd/ocrbench_suit.yaml | 4 - .../tasks/ocrbench_suit/ocrbench_suit.yaml | 4 - .../tasks/ocrbench_suit_vd/ocrbench_suit.yaml | 4 - .../tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml | 4 - .../tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml | 4 - .../tasks/siwei_bench_sub1/siwei_bench.yaml | 3 - .../siwei_bench_sub1/siwei_bench_layout.yaml | 22 - lmms_eval/tasks/siwei_bench_sub1/utils.py | 96 ---- .../siwei_bench_sub1_vd/siwei_bench.yaml | 3 - .../siwei_bench_layout.yaml | 26 -- lmms_eval/tasks/siwei_bench_sub1_vd/utils.py | 97 ---- .../tasks/siwei_bench_sub2/siwei_bench.yaml | 9 - .../siwei_bench_Environment.yaml | 26 -- .../siwei_bench_sub2/siwei_bench_MadeOf.yaml | 26 -- .../siwei_bench_Used_For.yaml | 26 -- .../siwei_bench_atlocation.yaml | 26 -- .../siwei_bench_sub2/siwei_bench_nearby.yaml | 26 -- .../siwei_bench_sub2/siwei_bench_partof.yaml | 26 -- lmms_eval/tasks/siwei_bench_sub2/utils.py | 96 ---- .../siwei_bench_sub2_shuffle/siwei_bench.yaml | 7 - .../siwei_bench_Environment.yaml | 30 -- .../siwei_bench_MadeOf.yaml | 30 -- .../siwei_bench_Used_For.yaml | 30 -- .../siwei_bench_atlocation.yaml | 30 -- .../siwei_bench_nearby.yaml | 30 -- .../siwei_bench_partof.yaml | 30 -- .../tasks/siwei_bench_sub2_shuffle/utils.py | 96 ---- .../siwei_bench_sub2_vd/siwei_bench.yaml | 9 - .../siwei_bench_Environment.yaml | 30 -- .../siwei_bench_MadeOf.yaml | 30 -- .../siwei_bench_Used_For.yaml | 30 -- .../siwei_bench_atlocation.yaml | 30 -- .../siwei_bench_nearby.yaml | 30 -- .../siwei_bench_partof.yaml | 30 -- lmms_eval/tasks/siwei_bench_sub2_vd/utils.py | 97 ---- .../tasks/siwei_bench_sub3/siwei_bench.yaml | 7 - .../siwei_bench_hasproperty.yaml | 36 -- .../siwei_bench_shapesimilarto.yaml | 20 - .../siwei_bench_similar_event.yaml | 20 - .../siwei_bench_subevent.yaml | 20 - lmms_eval/tasks/siwei_bench_sub3/utils.py | 138 ------ .../siwei_bench_sub3_shuffle/siwei_bench.yaml | 5 - .../siwei_bench_hasproperty.yaml | 38 -- .../siwei_bench_shapesimilarto.yaml | 22 - .../siwei_bench_similar_event.yaml | 22 - .../siwei_bench_subevent.yaml | 24 - .../tasks/siwei_bench_sub3_shuffle/utils.py | 138 ------ .../siwei_bench_sub3_vd/siwei_bench.yaml | 7 - .../siwei_bench_hasproperty.yaml | 38 -- .../siwei_bench_shapesimilarto.yaml | 22 - .../siwei_bench_similar_event.yaml | 22 - .../siwei_bench_subevent.yaml | 24 - lmms_eval/tasks/siwei_bench_sub3_vd/utils.py | 140 ------ test.py | 6 - test_blip.py | 0 test_cambrian.py | 8 - visual_code/echart.js | 145 ++++++ 
visual_code/echart_new.js | 283 +++++++++++ visual_code/plot_map.py | 108 +++++ 80 files changed, 989 insertions(+), 2545 deletions(-) delete mode 100644 lmms_eval/models/cambrian_8b.py rename lmms_eval/models/{cambrian.py => cambrian_model.py} (93%) create mode 100755 lmms_eval/models/gpt4v_01.py create mode 100755 lmms_eval/models/gpt4v_01_batch.py delete mode 100644 lmms_eval/models/test_swwu.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub1/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub1/siwei_bench_layout.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub1/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench_layout.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub1_vd/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Environment.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_MadeOf.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Used_For.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_atlocation.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_nearby.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_partof.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Environment.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_MadeOf.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Used_For.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_atlocation.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_nearby.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_partof.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Environment.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_MadeOf.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Used_For.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_atlocation.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_nearby.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_partof.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub3/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3/siwei_bench_hasproperty.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3/siwei_bench_shapesimilarto.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3/siwei_bench_similar_event.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3/siwei_bench_subevent.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_hasproperty.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_shapesimilarto.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_similar_event.yaml delete mode 100644 
lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_subevent.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_shuffle/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_hasproperty.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_shapesimilarto.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_similar_event.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_subevent.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_vd/utils.py delete mode 100644 test.py delete mode 100644 test_blip.py delete mode 100644 test_cambrian.py create mode 100644 visual_code/echart.js create mode 100644 visual_code/echart_new.js create mode 100644 visual_code/plot_map.py diff --git a/.gitignore b/.gitignore index 1188fd9dd..b04121242 100755 --- a/.gitignore +++ b/.gitignore @@ -104,3 +104,8 @@ describe/ clip-vit-base-patch32/ ai2d_check/ lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc +data_clean/ +gpt_4v/ +.js +clip-vit-base-patch32/ +*.whl \ No newline at end of file diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index bd8ae7dde..52e147672 100755 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -46,24 +46,17 @@ "qwen":"Qwen", 'llama':"Llama", "gemma":"Gemma", -<<<<<<< HEAD - "cambrian":"Cambrian", -======= - "cambrian_8b":"Cambrian_8b", ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e + "cambrian_model":"Cambrian", "internvl2": "InternVL2", "mantis": "Mantis", "emu2": "Emu2", "paligemma":"Paligemma", -<<<<<<< HEAD "internvl2_large":"InternVL2_large", "MIO_sft":"MIO", "onevision":"onevision", "onevision_large":"onevision_large", "cogvlm2":"cogvlm2", "MIO_batch":"MIO_batch" -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e } for model_name, model_class in AVAILABLE_MODELS.items(): diff --git a/lmms_eval/models/cambrian_8b.py b/lmms_eval/models/cambrian_8b.py deleted file mode 100644 index 6fb730174..000000000 --- a/lmms_eval/models/cambrian_8b.py +++ /dev/null @@ -1,438 +0,0 @@ -import torch -import os -import sys -from tqdm import tqdm -from lmms_eval import utils -from lmms_eval.api.instance import Instance -from lmms_eval.api.model import lmms -from PIL import Image -from datetime import timedelta -from lmms_eval.api.registry import register_model -# from lmms_eval.models.model_utils.qwen.qwen_generate_utils import make_context -# 获取当前文件的目录 -# current_dir = os.path.dirname(os.path.abspath(__file__)) -# # 获取同级目录的路径 -# parent_dir = os.path.dirname(current_dir) - -# # 将同级目录添加到模块搜索路径 -# sys.path.append(parent_dir) -# sys.path.append(current_dir) -# os.chdir('/ML-A100/team/mm/zk/lmms-eval/lmms_eval/models/cambrian') -from cambrian.model.builder import load_pretrained_model -from cambrian.conversation import conv_templates, SeparatorStyle -from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs -from accelerate.state import AcceleratorState -from typing import List, Optional, Union, Tuple -import uuid -import warnings -from transformers import PreTrainedTokenizer -temperature = 0 -CONTROLLER_HEART_BEAT_EXPIRATION = 30 -WORKER_HEART_BEAT_INTERVAL = 15 - -LOGDIR = "." 
- -# Model Constants -IGNORE_INDEX = -100 -IMAGE_TOKEN_INDEX = -200 -DEFAULT_IMAGE_TOKEN = "" -DEFAULT_IMAGE_PATCH_TOKEN = "" -DEFAULT_IM_START_TOKEN = "" -DEFAULT_IM_END_TOKEN = "" -IMAGE_PLACEHOLDER = "" - - -conv_mode = "llama_3" - -warnings.simplefilter("ignore", category=DeprecationWarning) -warnings.filterwarnings("ignore") - -from loguru import logger as eval_logger -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel - -def process(image, question, tokenizer, image_processor, model_config): - qs = question - - if model_config.mm_use_im_start_end: - qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs - else: - qs = DEFAULT_IMAGE_TOKEN + '\n' + str(qs) - - conv = conv_templates[conv_mode].copy() - conv.append_message(conv.roles[0], qs) - conv.append_message(conv.roles[1], None) - prompt = conv.get_prompt() - - image_size = [image.size] - image_tensor = process_images([image], image_processor, model_config) - - input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() - - return input_ids, image_tensor, image_size, prompt - -def make_context( - tokenizer: PreTrainedTokenizer, - query: str, - history: List[Tuple[str, str]] = None, - system: str = "", - max_window_size: int = 6144, - chat_format: str = "chatml", -): - if history is None: - history = [] - - if chat_format == "chatml": - im_start, im_end = "<|im_start|>", "<|im_end|>" - im_start_tokens = [tokenizer.im_start_id] - im_end_tokens = [tokenizer.im_end_id] - nl_tokens = tokenizer.encode("\n") - - def _tokenize_str(role, content): - return f"{role}\n{content}", tokenizer.encode(role, allowed_special=set(tokenizer.IMAGE_ST)) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST)) - - system_text, system_tokens_part = _tokenize_str("system", system) - system_tokens = im_start_tokens + system_tokens_part + im_end_tokens - - raw_text = "" - context_tokens = [] - - for turn_query, turn_response in reversed(history): - query_text, query_tokens_part = _tokenize_str("user", turn_query) - query_tokens = im_start_tokens + query_tokens_part + im_end_tokens - if turn_response is not None: - response_text, response_tokens_part = _tokenize_str("assistant", turn_response) - response_tokens = im_start_tokens + response_tokens_part + im_end_tokens - - next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens - prev_chat = f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" - else: - next_context_tokens = nl_tokens + query_tokens + nl_tokens - prev_chat = f"\n{im_start}{query_text}{im_end}\n" - - current_context_size = len(system_tokens) + len(next_context_tokens) + len(context_tokens) - if current_context_size < max_window_size: - context_tokens = next_context_tokens + context_tokens - raw_text = prev_chat + raw_text - else: - break - - context_tokens = system_tokens + context_tokens - raw_text = f"{im_start}{system_text}{im_end}" + raw_text - context_tokens += nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] + im_end_tokens + nl_tokens + im_start_tokens + tokenizer.encode("assistant") + nl_tokens - raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" - - elif chat_format == "raw": - raw_text = query - context_tokens = tokenizer.encode(raw_text) - else: - raise NotImplementedError(f"Unknown chat format {chat_format!r}") - - return raw_text, context_tokens - - - -@register_model("cambrian_8b") -class Cambrian_8b(lmms): - """ - cambrian_8b model - 
https://huggingface.co/nyu-visionx/cambrian-8b - """ - - def __init__( - self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/cambrian_8b", - # pretrained: str = "nyu-visionx/cambrian-8b", - device: Optional[str] = "cuda", - device_map="auto", - batch_size: Optional[Union[int, str]] = 1, - trust_remote_code: Optional[bool] = True, - use_cache=True, - **kwargs, - ) -> None: - super().__init__() - assert kwargs == {}, f"Unexpected kwargs: {kwargs}" - - - accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) - accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) - if accelerator.num_processes > 1 and device_map == "": - self._device = torch.device(f"cuda:{accelerator.local_process_index}") - self.device_map = f"cuda:{accelerator.local_process_index}" - else: - self._device = device - self.device_map = device_map - - model_name = pretrained # Assuming `pretrained` is a path or model id - # self._model = AutoModelForCausalLM.from_pretrained( - # pretrained, - # torch_dtype="auto", - # device_map=self.device_map - # ) - # self._model = AutoModel.from_pretrained(self.pretrained, device_map=self.device_map, trust_remote_code=True) - # self._model = AutoModelForCausalLM.from_pretrained(pretrained, device_map=self._device, trust_remote_code=trust_remote_code).eval() - # model_path = os.path.expanduser("nyu-visionx/cambrian-8b") - model_name = get_model_name_from_path(pretrained) - tokenizer, model, self.image_processor, context_len = load_pretrained_model(pretrained, None, model_name,device_map=self.device_map) - self._model = model - self._tokenizer = tokenizer - self.tokenizer.pad_token_id=self.tokenizer.eos_token_id - # device_map=self.device_map - # self._tokenizer = AutoTokenizer.from_pretrained(pretrained) - self._model.eval() - # self._config = self._model.config - # self.model.tie_weights() - self.batch_size_per_gpu = int(batch_size) - self.use_cache = use_cache - if accelerator.num_processes > 1: - assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." - # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model - # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works - # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. - if accelerator.distributed_type == DistributedType.DEEPSPEED: - kwargs = { - "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, - "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, - } - AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) - eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. 
Make sure you run `accelerate config` and set zero stage to 0") - if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: - self._model = accelerator.prepare(self.model) - else: - self._model = accelerator.prepare_model(self.model, evaluation_mode=True) - self.accelerator = accelerator - if self.accelerator.is_local_main_process: - eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") - self._rank = self.accelerator.local_process_index - self._world_size = self.accelerator.num_processes - else: - self.model.to(self._device) - self._rank = 0 - self._word_size = 1 - # if accelerator.num_processes > 1: - # self._model = accelerator.prepare(self._model) - # else: - # self.model = torch.nn.DataParallel(self.model) - # self.model.to(self._device) - - self.accelerator = accelerator - - @property - def config(self): - # return the associated transformers.AutoConfig for the given pretrained model. - return self._config - - @property - def tokenizer(self): - return self._tokenizer - - @property - def model(self): - # returns the model, unwrapping it if using Accelerate - if hasattr(self, "accelerator"): - return self.accelerator.unwrap_model(self._model) - else: - return self._model - - @property - def eot_token_id(self): - # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* - return self.tokenizer.eos_token_id - - @property - def max_length(self): - return self._max_length - - # should be deleted since max_new_tokens is decided by gen_kwargs not a model property - # @property - # def max_new_tokens(self) -> int: - # return 256 - - @property - def batch_size(self): - return self.batch_size_per_gpu - - @property - def device(self): - return self._device - - @property - def rank(self): - return self._rank - - @property - def world_size(self): - return self._world_size - - def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: - res = [] - pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") - - for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: - # encode, pad, and truncate contexts for this batch - if type(doc_to_target) == str: - continuation = doc_to_target - else: - continuation = doc_to_target(self.task_dict[task][split][doc_id]) - visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] - visuals = self.flatten(visuals) - query = [] - visual_paths = [] - for visual in visuals: - name = uuid.uuid4().hex.upper()[0:6] - visual.save(f"/tmp/{name}.png") - visual_paths.append(f"/tmp/{name}.png") - query.append({"image": f"/tmp/{name}.png"}) - - # Make a copy for query to save context (text that needs to be masked) - context_query = [_ for _ in query] - context_query.append({"text": contexts}) - query.append({"text": contexts + continuation}) - - context_query = self.tokenizer.from_list_format(context_query) - query = self.tokenizer.from_list_format(query) - - raw_contxt_text, context_tokens = make_context( - self.tokenizer, context_query, history=None, system="You are a helpful assistant", max_window_size=self.model.generation_config.max_window_size, chat_format=self.model.generation_config.chat_format - ) - context_tokens = torch.tensor([context_tokens]) - - raw_continuation_text, continuation_tokens = make_context( - self.tokenizer, query, history=None, system="You are a helpful assistant", max_window_size=self.model.generation_config.max_window_size, 
chat_format=self.model.generation_config.chat_format - ) - continuation_tokens = torch.tensor([continuation_tokens]).to(self.model.device) - attn_mask = torch.ones_like(continuation_tokens).to(self.model.device) - labels = continuation_tokens.clone().to(self.model.device) - labels[:, : context_tokens.shape[1]] = -100 - with torch.inference_mode(): - outputs = self.model(input_ids=continuation_tokens, labels=labels, attention_mask=attn_mask) - loss = outputs.loss - logits = outputs["logits"] - greedy_tokens = logits.argmax(dim=-1) - cont_toks = continuation_tokens[:, context_tokens.shape[1] :] - greedy_tokens = greedy_tokens[:, context_tokens.shape[1] : continuation_tokens.shape[1]] # [1, seq] - max_equal = (greedy_tokens == cont_toks).all() - res.append((float(loss.item()), bool(max_equal))) - pbar.update(1) - - pbar.close() - return res - - def flatten(self, input): - new_list = [] - for i in input: - for j in i: - new_list.append(j) - return new_list - - def generate_until(self, requests: List[Instance]) -> List[str]: - res = [] - - def _collate(x): - # the negative sign on len(toks) sorts descending - this has a few advantages: - # - time estimates will always be over not underestimates, which is more useful for planning - # - to know the size of a batch when going through the list, you know the first one is always the batch - # padded context length. this is useful to simplify the batching logic and more importantly to make - # automatic adaptive batches much much easier to implement - # - any OOMs will happen right away rather than near the end - toks = self.tokenizer.encode(x[0]) - return -len(toks), x[0] - - pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") - # we group requests by their generation_kwargs, - # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling - # in the same batch. - re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) - chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) - for chunk in chunks: - contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) - task = task[0] - split = split[0] - visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] - visuals = self.flatten(visuals) - visual_paths = [] - # save images to /tmp, name generated by hash function - # qwen accept image path. Have to do it here.... - for visual in visuals: - name = uuid.uuid4().hex.upper()[0:6] - visual.save(f"/ML-A100/team/mm/zk/lmms-eval/lmms_eval/tmp/{name}.png") - visual_paths.append(f"/ML-A100/team/mm/zk/lmms-eval/lmms_eval/tmp/{name}.png") - - # we assume all gen kwargs in the batch are the same - # this is safe to assume because the `grouper` object ensures it. 
- gen_kwargs = all_gen_kwargs[0] - - # Set default values for until and max_new_tokens - # until = [self.tokenizer.decode(self.eot_token_id)] - - # # Update values from gen_kwargs if present - # if "until" in gen_kwargs: - # until = gen_kwargs.pop("until") - # if isinstance(until, str): - # until = [until] - # elif not isinstance(until, list): - # raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") - # preconfigure gen_kwargs with defaults - if "image_sizes" not in gen_kwargs: - try: - gen_kwargs["image_sizes"] = [visuals[0].size] - except: - gen_kwargs["image_sizes"] = None - if "max_new_tokens" not in gen_kwargs: - gen_kwargs["max_new_tokens"] = 1024 - if "temperature" not in gen_kwargs: - gen_kwargs["temperature"] = 0 - if "top_p" not in gen_kwargs: - gen_kwargs["top_p"] = None - if "num_beams" not in gen_kwargs: - gen_kwargs["num_beams"] = 1 - - # self.tokenizer.pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eod_id - # image_path = input("image path: ") - image = Image.open(visual_paths[0]).convert('RGB') - # question = input("question: ") - question=contexts[0] - # print(question) - input_ids, image_tensor, image_sizes, prompt = process(image, question, self.tokenizer, self.image_processor, self.model.config) - input_ids = input_ids.to(device='cuda', non_blocking=True) - with torch.inference_mode(): - output_ids = self.model.generate( - input_ids, - images=image_tensor, - image_sizes=image_sizes, - do_sample=True if temperature > 0 else False, - temperature=gen_kwargs["temperature"], - num_beams=gen_kwargs["num_beams"], - max_new_tokens=gen_kwargs["max_new_tokens"], - use_cache=True) - text_outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() - # cont_toks_list = cont.tolist() - # for cont_toks, context in zip(cont_toks_list, contexts): - # # discard context + left-padding toks if using causal decoder-only LMM - # cont_toks = cont_toks[input_ids.input_ids.shape[1] :] - # text_outputs = self.tokenizer.decode(cont_toks, skip_special_tokens=True).strip() - # for term in until: - # if len(term) > 0: - # # ignore '' separator, - # # for seq2seq case where self.tok_decode(self.eot_token_id) = '' - # text_outputs = text_outputs.split(term)[0] - - # res.append(text_outputs) - - # self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs) - # remove visuals from tmp - # print(text_outputs) - for visual_path in visual_paths: - try: - os.remove(visual_path) - except: - pass - output_text=[text_outputs] - res.extend(output_text) - pbar.update(1) - # reorder this group of results back to original unsorted form - # res = re_ords.get_original(res) - - - pbar.close() - return res diff --git a/lmms_eval/models/cambrian.py b/lmms_eval/models/cambrian_model.py similarity index 93% rename from lmms_eval/models/cambrian.py rename to lmms_eval/models/cambrian_model.py index 0085a27e6..4da0c5e1a 100644 --- a/lmms_eval/models/cambrian.py +++ b/lmms_eval/models/cambrian_model.py @@ -8,16 +8,6 @@ from PIL import Image from datetime import timedelta from lmms_eval.api.registry import register_model -# from lmms_eval.models.model_utils.qwen.qwen_generate_utils import make_context -# 获取当前文件的目录 -# current_dir = os.path.dirname(os.path.abspath(__file__)) -# # 获取同级目录的路径 -# parent_dir = os.path.dirname(current_dir) - -# # 将同级目录添加到模块搜索路径 -# sys.path.append(parent_dir) -# sys.path.append(current_dir) -# os.chdir('./lmms_eval/models/cambrian') from 
cambrian.model.builder import load_pretrained_model from cambrian.conversation import conv_templates, SeparatorStyle from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path @@ -131,7 +121,7 @@ def _tokenize_str(role, content): -@register_model("cambrian") +@register_model("cambrian_model") class Cambrian(lmms): """ cambrian_8b model @@ -140,8 +130,7 @@ class Cambrian(lmms): def __init__( self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/cambrian_8b", - # pretrained: str = "nyu-visionx/cambrian-8b", + pretrained: str = "nyu-visionx/cambrian-8b", device: Optional[str] = "cuda", device_map="auto", batch_size: Optional[Union[int, str]] = 1, @@ -353,7 +342,7 @@ def _collate(x): for visual in visuals: name = uuid.uuid4().hex.upper()[0:6] visual.save(f"./lmms_eval/tmp/{name}.png") - visual_paths.append(f"./lmms_eval/tmp/{name}.png") + visual_paths.append(f"./lmms-eval/lmms_eval/tmp/{name}.png") # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. @@ -370,15 +359,15 @@ def _collate(x): elif not isinstance(until, list): raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") # preconfigure gen_kwargs with defaults - if "image_sizes" not in gen_kwargs: - try: - gen_kwargs["image_sizes"] = [visuals[0].size] - except: - gen_kwargs["image_sizes"] = None + # if "image_sizes" not in gen_kwargs: + # try: + # gen_kwargs["image_sizes"] = [visuals[0].size] + # except: + # gen_kwargs["image_sizes"] = None if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: - gen_kwargs["temperature"] = 0 + gen_kwargs["temperature"]= 0 if "top_p" not in gen_kwargs: gen_kwargs["top_p"] = None if "num_beams" not in gen_kwargs: @@ -400,12 +389,13 @@ def _collate(x): input_ids, images=image_tensor, image_sizes=image_sizes, - do_sample=True if temperature > 0 else False, - temperature=gen_kwargs["temperature"], - num_beams=gen_kwargs["num_beams"], - max_new_tokens=gen_kwargs["max_new_tokens"], - # eos_token_id=eot_token_id, - use_cache=True) + **gen_kwargs) + # do_sample=True if temperature > 0 else False, + # temperature=gen_kwargs["temperature"], + # num_beams=gen_kwargs["num_beams"], + # max_new_tokens=gen_kwargs["max_new_tokens"], + # # eos_token_id=eot_token_id, + # use_cache=True) text_outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() # cont_toks_list = cont.tolist() diff --git a/lmms_eval/models/emu2.py b/lmms_eval/models/emu2.py index 4684599f4..fb77af853 100755 --- a/lmms_eval/models/emu2.py +++ b/lmms_eval/models/emu2.py @@ -15,10 +15,7 @@ from tqdm import tqdm from accelerate import Accelerator, DistributedType from accelerate.state import AcceleratorState -<<<<<<< HEAD from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e from loguru import logger as eval_logger @@ -33,10 +30,7 @@ def __init__( self, pretrained: str = "BAAI/Emu2", device: Optional[str] = "cuda", -<<<<<<< HEAD device_map='auto', -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e max_new_tokens: int = 256, batch_size: Optional[Union[int, str]] = 1, **kwargs, @@ -51,7 +45,6 @@ def __init__( else: self._device = device -<<<<<<< HEAD self._model = AutoModelForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self._device, trust_remote_code=True) # with init_empty_weights(): 
# model = AutoModelForCausalLM.from_pretrained( @@ -68,9 +61,6 @@ def __init__( # model, # 'local/path/to/hf/version/Emu2-Chat/model', # device_map=device_map).eval() -======= - self._model = AutoModelForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device, trust_remote_code=True) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e # self._model = None self.model.eval() self.model.tie_weights() @@ -204,15 +194,12 @@ def _collate(x): gen_kwargs = all_gen_kwargs[0] if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 -<<<<<<< HEAD if "until" in gen_kwargs: until = gen_kwargs.pop("until") if isinstance(until, str): until = [until] elif not isinstance(until, list): raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk) visuals = [doc_to_visual(self.task_dict[task][split][ids]) for ids, task, split, doc_to_visual in zip(doc_id, tasks, splits, doc_to_visuals)] # print(visuals[0]) @@ -221,11 +208,6 @@ def _collate(x): # visuals = [visuals[idx][0] for idx in range(len(visuals))] # get the first image in multi-image scenarios. # assert len(contexts) == self.batch_size_per_gpu, f"Expected contexts batch size {self.batch_size_per_gpu}, got {len(contexts)}" # assert len(visuals) == self.batch_size_per_gpu, f"Expected visuals batch size {self.batch_size_per_gpu}, got {len(visuals)}" -<<<<<<< HEAD -======= - print('') - print(contexts) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e formatted_contexts = [f"[]{context}" for context in contexts] formatted_contexts[0] = formatted_contexts[0].replace('.', ':') # print(formatted_contexts) @@ -233,12 +215,8 @@ def _collate(x): inputs = self.model.build_input_ids( text=formatted_contexts, tokenizer=self.tokenizer, -<<<<<<< HEAD image=visuals, # device=self.model.device, -======= - image=visuals ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e ) # inputs = self.model.build_input_ids( # text=formatted_contexts, @@ -249,21 +227,12 @@ def _collate(x): # ) outputs = self.model.generate( -<<<<<<< HEAD input_ids=inputs["input_ids"].to(self.device), attention_mask=inputs["attention_mask"].to(self.device), image=inputs["image"].to(self.device,torch.bfloat16), # max_new_tokens=gen_kwargs["max_new_tokens"], # length_penalty=-1 **gen_kwargs -======= - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image=inputs["image"].to(torch.bfloat16), - max_new_tokens=gen_kwargs["max_new_tokens"], - length_penalty=-1 - # **gen_kwargs ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e ) output_text = self._tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/lmms_eval/models/gemma.py b/lmms_eval/models/gemma.py index c45cfd5af..93f41a563 100644 --- a/lmms_eval/models/gemma.py +++ b/lmms_eval/models/gemma.py @@ -30,7 +30,7 @@ class Gemma(lmms): """ def __init__( self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/gemma-1.1-7b-it", + pretrained: str = "None", device: Optional[str] = "cuda", batch_size: Optional[Union[int, str]] = 1, device_map="auto", diff --git a/lmms_eval/models/gpt4v_01.py b/lmms_eval/models/gpt4v_01.py new file mode 100755 index 000000000..e65337679 --- /dev/null +++ b/lmms_eval/models/gpt4v_01.py @@ -0,0 +1,202 @@ +from io import BytesIO +from copy import deepcopy +import numpy as np +import os +import base64 +from typing import List, Tuple +from tqdm import tqdm +import 
requests as url_requests +import time + + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval import utils + +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState + +try: + from decord import VideoReader, cpu +except ImportError: + pass + +from PIL import Image + +API_TYPE = os.getenv("API_TYPE", "openai") +NUM_SECONDS_TO_SLEEP = 30 +from loguru import logger as eval_logger +from openai import OpenAI +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +@register_model("gpt4v_01") +class GPT4V_01(lmms): + def __init__( + self, + model_version: str = "gpt-4-vision-preview", + modality: str = "image", + max_frames_for_video: int = 10, + timeout: int = 120, + **kwargs, + ) -> None: + super().__init__() + # Manually set a image token for GPT4V so that we can search for it + # and split the text and image + # Here we just use the same token as llava for convenient + self.model_version = model_version + self.modality = modality + self.max_frames_for_video = max_frames_for_video + self.image_token = "" + self.timeout = timeout + + + accelerator = Accelerator() + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." 
+ self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + # Function to encode the image + def encode_image(self, image: Image): + output_buffer = BytesIO() + image.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + # Function to encode the video + def encode_video(self, video_path, for_get_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + img = Image.fromarray(frame) + output_buffer = BytesIO() + img.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + base64_frames.append(base64_str) + + return base64_frames + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + # visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = [doc_to_visual(self.task_dict[task][split][0])] + visuals = self.flatten(visuals) + imgs = [] # multiple images or frames for video + for visual in visuals: + if self.modality == "image": + img = self.encode_image(visual) + imgs.append(img) + elif self.modality == "video": + frames = self.encode_video(visual, self.max_frames_for_video) + imgs.extend(frames) + + payload = {"model": self.model_version, "messages": []} + response_json = {"role": "user", "content": []} + # When there is no image token in the context, append the image to the text + if self.image_token not in contexts: + payload["messages"].append(deepcopy(response_json)) + payload["messages"][0]["content"].append({"type": "text", "text": contexts}) + for img in imgs: + payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + else: + contexts = contexts.split(self.image_token) + for idx, img in enumerate(imgs): + payload["messages"].append(deepcopy(response_json)) + payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]}) + payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + + # If n image tokens are in the contexts + # contexts will be splitted into n+1 chunks + # Manually add it into the payload + payload["messages"].append(deepcopy(response_json)) + payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = 
None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + payload["max_tokens"] = gen_kwargs["max_new_tokens"] + payload["temperature"] = gen_kwargs["temperature"] + + for attempt in range(5): + try: + client = OpenAI( + base_url=API_URL, + api_key=API_KEY, + ) + # print(payload["messages"]) + response = client.chat.completions.create( + model=self.model_version, + messages=payload["messages"], + max_tokens=payload["max_tokens"], + # timeout=timeout, + temperature=payload["temperature"], + ) + + content = response.choices[0].message.content.strip() + break # If successful, break out of the loop + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}") + if attempt < 5 - 1: # If we have retries left, sleep and then continue to next attempt + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}") + eval_logger.error(f"Response: {response}") + content = "" + res.append(content) + pbar.update(1) + pbar.close() + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "GPT4V not support" diff --git a/lmms_eval/models/gpt4v_01_batch.py b/lmms_eval/models/gpt4v_01_batch.py new file mode 100755 index 000000000..9246119fb --- /dev/null +++ b/lmms_eval/models/gpt4v_01_batch.py @@ -0,0 +1,224 @@ +from io import BytesIO +from copy import deepcopy +from concurrent.futures import ThreadPoolExecutor, as_completed +import numpy as np +import os +import base64 +from typing import List, Tuple +from tqdm import tqdm +import requests as url_requests +import time + + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval import utils + +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState + +try: + from decord import VideoReader, cpu +except ImportError: + pass + +from PIL import Image + +API_TYPE = os.getenv("API_TYPE", "openai") +NUM_SECONDS_TO_SLEEP = 30 +from loguru import logger as eval_logger +from openai import OpenAI +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +# def is_openai_v1() -> bool: +# from importlib.metadata import version +# from packaging.version import Version, parse +# _version = parse(version("openai")) +# return _version >= Version("1.0.0") + + +# if is_openai_v1(): +# API_URL = os.path.join(API_URL, "openai") + + + +@register_model("gpt4v_01_batch") +class GPT4V_01_batch(lmms): + def __init__( + self, + model_version: str = "gpt-4-vision-preview", + modality: str = "image", + max_frames_for_video: int = 10, + timeout: int = 120, + **kwargs, + ) -> None: + super().__init__() + # Manually set a image token for GPT4V so that we can search for it + # and split the text and image + # Here we just use the same token as llava for convenient + self.model_version = model_version + self.modality = modality + 
self.max_frames_for_video = max_frames_for_video + self.image_token = "" + self.timeout = timeout + + + accelerator = Accelerator() + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + # Function to encode the image + def encode_image(self, image: Image): + output_buffer = BytesIO() + image.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + # Function to encode the video + def encode_video(self, video_path, for_get_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + img = Image.fromarray(frame) + output_buffer = BytesIO() + img.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + base64_frames.append(base64_str) + + return base64_frames + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def process_request(self, request, idx): + """Process a single request.""" + contexts, gen_kwargs, doc_to_visual, doc_id, task, split = request.args + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + imgs = [] # multiple images or frames for video + + for visual in visuals: + if self.modality == "image": + img = self.encode_image(visual) + imgs.append(img) + elif self.modality == "video": + frames = self.encode_video(visual, self.max_frames_for_video) + imgs.extend(frames) + + payload = {"model": self.model_version, "messages": []} + response_json = {"role": "user", "content": []} + + if self.image_token not in contexts: + payload["messages"].append(deepcopy(response_json)) + payload["messages"][0]["content"].append({"type": "text", "text": contexts}) + for img in imgs: + payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + else: + contexts = contexts.split(self.image_token) + for idx, img in enumerate(imgs): + payload["messages"].append(deepcopy(response_json)) + payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]}) + payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + + payload["messages"].append(deepcopy(response_json)) + payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if
"temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + payload["max_tokens"] = gen_kwargs["max_new_tokens"] + payload["temperature"] = gen_kwargs["temperature"] + for attempt in range(5): + try: + client = OpenAI( + base_url=API_URL, + api_key=API_KEY, + ) + + response = client.chat.completions.create( + model=self.model_version, + messages=payload["messages"], + max_tokens=payload["max_tokens"], + temperature=payload["temperature"], + ) + + content = response.choices[0].message.content.strip() + return content # return the content on success + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}") + if attempt < 5 - 1: # if there are retries left + time.sleep(NUM_SECONDS_TO_SLEEP) + else: + eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}") + return "" # return an empty string as the failure result + + def generate_until(self, requests) -> List[str]: + res = [None] * len(requests) # initialize the result array with None, same length as the request list + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + NUM_WORKERS = 16 + with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: + future_to_request = {executor.submit(lambda reg=reg, idx=idx: self.process_request(reg, idx), reg): idx for idx, reg in enumerate(requests)} + + for future in as_completed(future_to_request): + idx = future_to_request[future] # get the index of the corresponding request + try: + content = future.result() + except Exception as e: + eval_logger.error(f"Error processing request: {str(e)}") + content = "" + + res[idx] = content # put the result in its corresponding position + pbar.update(1) + + pbar.close() + return res # return the result array + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "GPT4V not support" diff --git a/lmms_eval/models/idefics2.py b/lmms_eval/models/idefics2.py index 0d73d2c95..1cb82f076 100644 --- a/lmms_eval/models/idefics2.py +++ b/lmms_eval/models/idefics2.py @@ -215,19 +215,11 @@ def _collate(x): prompt = self._processor.apply_chat_template(message, add_generation_prompt=True) prompts.append(prompt) -<<<<<<< HEAD # print(contexts) # print(prompts) # print(visuals) # input() # -======= - print(contexts) - print(prompts) - print(visuals) - input() - ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e inputs = self._processor(text=prompts, images=visuals, padding=True, return_tensors="pt") if 'max_new_tokens' not in gen_kwargs: gen_kwargs['max_new_tokens']=1024 diff --git a/lmms_eval/models/internvl2.py b/lmms_eval/models/internvl2.py index 5115c6974..94fa3de03 100644 --- a/lmms_eval/models/internvl2.py +++ b/lmms_eval/models/internvl2.py @@ -12,11 +12,7 @@ from lmms_eval.api.model import lmms from tqdm import tqdm import logging -<<<<<<< HEAD import math -======= - ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e eval_logger = logging.getLogger("eval_logger") IMAGENET_MEAN = (0.485, 0.456, 0.406) @@ -125,7 +121,6 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3 from accelerate.state import AcceleratorState from accelerate.utils import InitProcessGroupKwargs -<<<<<<< HEAD # def split_model(model_name): # device_map = {} # world_size = 8 @@ -150,8 +145,6 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3 # device_map['language_model.lm_head'] = 0 # device_map[f'language_model.model.layers.{num_layers - 1}'] = 0 # return device_map -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e @register_model("internvl2")
class InternVL2(lmms): @@ -173,11 +166,7 @@ def __init__( batch_size = int(batch_size) assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}." self.batch_size_per_gpu = batch_size -<<<<<<< HEAD model_key=self.path.split('/')[-1] -======= - ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) self.accelerator = accelerator diff --git a/lmms_eval/models/minicpm_v.py b/lmms_eval/models/minicpm_v.py index e117e7fad..26c48e800 100644 --- a/lmms_eval/models/minicpm_v.py +++ b/lmms_eval/models/minicpm_v.py @@ -180,13 +180,9 @@ def _collate(x): elif not isinstance(until, list): raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") assert self.batch_size_per_gpu == 1, "Do not support batch_size_per_gpu > 1 for now" -<<<<<<< HEAD # assert len(visuals) == 1, "MiniCPM_V interface does not support bn_image > 1 for now" if len(visuals)>1: visuals=visuals[:1]#debug use first image -======= - assert len(visuals) == 1, "MiniCPM_V interface does not support bn_image > 1 for now" ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e context = contexts[0] if "" in context: # minicpm does not expect the tag diff --git a/lmms_eval/models/paligemma.py b/lmms_eval/models/paligemma.py index 97e252085..1931264cc 100755 --- a/lmms_eval/models/paligemma.py +++ b/lmms_eval/models/paligemma.py @@ -4,11 +4,7 @@ warnings.filterwarnings("ignore") from accelerate import Accelerator, DistributedType -<<<<<<< HEAD from transformers import AutoProcessor, PaliGemmaForConditionalGeneration,AutoTokenizer -======= -from transformers import AutoProcessor, PaliGemmaForConditionalGeneration ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model import torch @@ -47,7 +43,6 @@ def __init__( else: self._device = device -<<<<<<< HEAD # self._model = FuyuForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device) self._model = PaliGemmaForConditionalGeneration.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device) self.model.eval() @@ -57,15 +52,6 @@ def __init__( # self.image_processor = FuyuImageProcessor() self.processor = AutoProcessor.from_pretrained(pretrained) -======= - self._model = PaliGemmaForConditionalGeneration.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device, revision="bfloat16", trust_remote_code=True).eval() - # self._model = None - self.model.eval() - self.model.tie_weights() - self._tokenizer = AutoProcessor.from_pretrained(pretrained) - self._config = self.model.config - ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e self.max_new_tokens = max_new_tokens self.batch_size_per_gpu = int(batch_size) accelerator = Accelerator() @@ -166,15 +152,12 @@ def generate_until(self, requests: List[Instance]) -> List[str]: res = [] def _collate(x): -<<<<<<< HEAD # the negative sign on len(toks) sorts descending - this has a few advantages: # - time estimates will always be over not underestimates, which is more useful for planning # - to know the size of a batch when going through the list, you know the first one is always the batch # padded context length. 
this is useful to simplify the batching logic and more importantly to make # automatic adaptive batches much much easier to implement # - any OOMs will happen right away rather than near the end -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e toks = self.tok_encode(x[0]) return -len(toks), x[0] @@ -184,7 +167,6 @@ def _collate(x): pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") for chunk in chunks: -<<<<<<< HEAD contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) task = task[0] split = split[0] @@ -245,50 +227,6 @@ def _collate(x): # app = [gen_text.split("\x04")[1].strip(" ").strip("\n") for gen_text in generation_texts] # print("输入:",formatted_contexts) # res.extend(response) -======= - # contexts, all_gen_kwargs, doc_to_visual, doc_id, tasks, split = zip(*chunk) - contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk) - gen_kwargs = all_gen_kwargs[0] - if "max_new_tokens" not in gen_kwargs: - gen_kwargs["max_new_tokens"] = 1024 - contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk) - visuals = [doc_to_visual(self.task_dict[task][split][ids]) for ids, task, split, doc_to_visual in zip(doc_id, tasks, splits, doc_to_visuals)] - # print(visuals[0]) - visuals = [v[0] for v in visuals] - - formatted_contexts = [context for context in contexts] - formatted_contexts[0] = formatted_contexts[0].replace('.', ':') - - # inputs = self.model.build_input_ids( - # text=formatted_contexts, - # tokenizer=self.tokenizer, - # image=visuals - # ) - - model_inputs = self.tokenizer(text=formatted_contexts, images=visuals, return_tensors="pt").to("cuda") - input_len = model_inputs["input_ids"].shape[-1] - # print('question is: ') - print(formatted_contexts) - generation = self.model.generate(**model_inputs, max_new_tokens = gen_kwargs['max_new_tokens'], do_sample=False) - generation = generation[0][input_len:] - decoded = self.tokenizer.decode(generation, skip_special_tokens=True) - # print(decoded) - - # outputs = self.tokenizer.generate( - # input_ids=inputs["input_ids"], - # attention_mask=inputs["attention_mask"], - # image=inputs["image"].to(torch.bfloat16), - # max_new_tokens=gen_kwargs["max_new_tokens"], - # length_penalty=-1 - # # **gen_kwargs - # ) - - - # output_text = self._tokenizer.batch_decode(outputs, skip_special_tokens=True) - # output_text = [t.strip(" ").strip("\n") for t in output_text] - # print(decoded) - res.extend([decoded]) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e pbar.update(1) pbar.close() @@ -337,11 +275,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: """ """ add_special_tokens = False if add_special_tokens is None else add_special_tokens -<<<<<<< HEAD encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) -======= - encoding = self.tokenizer.tokenizer.encode(string, add_special_tokens=add_special_tokens) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e # left-truncate the encoded context to be at most `left_truncate_len` tokens long if left_truncate_len: encoding = encoding[-left_truncate_len:] diff --git a/lmms_eval/models/qwen.py b/lmms_eval/models/qwen.py index 9e37c69b8..88f59562a 100644 --- a/lmms_eval/models/qwen.py +++ b/lmms_eval/models/qwen.py @@ -31,7 +31,7 @@ class Qwen(lmms): def __init__( self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/Qwen2-72B-Instruct", + pretrained: str = None, 
device: Optional[str] = "cuda", batch_size: Optional[Union[int, str]] = 1, device_map="auto", diff --git a/lmms_eval/models/test_swwu.py b/lmms_eval/models/test_swwu.py deleted file mode 100644 index 1392c7894..000000000 --- a/lmms_eval/models/test_swwu.py +++ /dev/null @@ -1,78 +0,0 @@ -<<<<<<< HEAD -from PIL import Image -import requests -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - - -tokenizer = AutoTokenizer.from_pretrained("/ML-A100/team/mm/wangzekun/kangz/ViLLaMA/models/Emu2-Chat") - -model = AutoModelForCausalLM.from_pretrained( - "/ML-A100/team/mm/wangzekun/kangz/ViLLaMA/models/Emu2-Chat", - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - trust_remote_code=True).to('cuda').eval() - - -# `[]` is the image placeholder which will be replaced by image embeddings. -# the number of `[]` should be equal to the number of input images - -query = 'Describe the image in details:' -image = Image.open('./image2.jpg').convert('RGB') - -print([query]) -print([image]) -inputs = model.build_input_ids( - text=[query], - tokenizer=tokenizer, - image=[image] -) - -with torch.no_grad(): - outputs = model.generate( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image=inputs["image"].to(torch.bfloat16), - max_new_tokens=64, - length_penalty=-1) - -output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - -print(output_text) -======= -from transformers import AutoProcessor, PaliGemmaForConditionalGeneration -from PIL import Image -import requests -import torch - -model_id = "/ML-A100/team/mm/zhangge/models/paligemma-3b-pt-224" -device = "cuda:0" -dtype = torch.bfloat16 - - -model = PaliGemmaForConditionalGeneration.from_pretrained( - model_id, - torch_dtype=dtype, - device_map=device, - revision="bfloat16", -).eval() -processor = AutoProcessor.from_pretrained(model_id) - - -prompt = 'What is the difference of those two images:' -image = Image.open('./image2.jpg').convert('RGB') - -model_inputs = processor(text=[prompt], images=[image, image], return_tensors="pt").to(model.device) - -print(model_inputs) - -input() -input_len = model_inputs["input_ids"].shape[-1] - -with torch.inference_mode(): - generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False) - generation = generation[0][input_len:] - decoded = processor.decode(generation, skip_special_tokens=True) - print(decoded) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e diff --git a/lmms_eval/models/yi.py b/lmms_eval/models/yi.py index b8fd2e0f2..ba08fd253 100644 --- a/lmms_eval/models/yi.py +++ b/lmms_eval/models/yi.py @@ -31,7 +31,7 @@ class Yi(lmms): def __init__( self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/Yi-1.5-34B-Chat", + pretrained: str = "None", device: Optional[str] = "cuda", batch_size: Optional[Union[int, str]] = 1, device_map="auto", diff --git a/lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml b/lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml index dd2973d4f..c2e9d79b3 100755 --- a/lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml +++ b/lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-VD/ai2d -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-VD/ai2d ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e task: "ai2d_vd" dataset_kwargs: token: True diff --git a/lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml b/lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml index 07eee1bbc..f9069efca 100755 --- a/lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml +++ 
b/lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-jsonl/ai2d -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-jsonl/ai2d ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e task: "ai2d_suit" dataset_kwargs: token: True diff --git a/lmms_eval/tasks/chartqa_all_vd/chartqa.yaml b/lmms_eval/tasks/chartqa_all_vd/chartqa.yaml index f8d5300b0..5bb135f84 100755 --- a/lmms_eval/tasks/chartqa_all_vd/chartqa.yaml +++ b/lmms_eval/tasks/chartqa_all_vd/chartqa.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-VD/ChartQA/ -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-VD/ChartQA/ ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "chartqa_vd" diff --git a/lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml b/lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml index 8bee1b819..0471b0c2b 100755 --- a/lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml +++ b/lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-selected-VD/ChartQA/ -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-selected-VD/ChartQA/ ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "chartqa_suit_vd" diff --git a/lmms_eval/tasks/mmmu/mmmu_aug.yaml b/lmms_eval/tasks/mmmu/mmmu_aug.yaml index 492c8f108..d335d0896 100755 --- a/lmms_eval/tasks/mmmu/mmmu_aug.yaml +++ b/lmms_eval/tasks/mmmu/mmmu_aug.yaml @@ -1,4 +1,4 @@ -dataset_path: /ML-A100/team/mm/zhangge/MMMU/right_and_error/reorder/MMMU_Dataset/mmmu_aug_dataset +dataset_path: ./MMMU/right_and_error/reorder/MMMU_Dataset/mmmu_aug_dataset task: "mmmu_aug" test_split: train output_type: generate_until diff --git a/lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml b/lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml index 7902ea848..cc754d478 100644 --- a/lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml +++ b/lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml @@ -1,4 +1,4 @@ -dataset_path: /ML-A100/team/mm/zhangge/MMMU_V/verify_interface_vision/annotate_stage2/processed_jsonl/shuffled.jsonl +dataset_path: ./MMMU_V/verify_interface_vision/annotate_stage2/processed_jsonl/shuffled.jsonl task: "mmmu_aug_group_img" test_split: augument output_type: generate_until diff --git a/lmms_eval/tasks/ocrbench_all_vd/ocrbench_suit.yaml b/lmms_eval/tasks/ocrbench_all_vd/ocrbench_suit.yaml index 1d860e5b7..28a878752 100644 --- a/lmms_eval/tasks/ocrbench_all_vd/ocrbench_suit.yaml +++ b/lmms_eval/tasks/ocrbench_all_vd/ocrbench_suit.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-VD/ocrbench -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-VD/ocrbench ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "ocrbench_vd" diff --git a/lmms_eval/tasks/ocrbench_suit/ocrbench_suit.yaml b/lmms_eval/tasks/ocrbench_suit/ocrbench_suit.yaml index d6b69e93b..3e6286b7e 100644 --- a/lmms_eval/tasks/ocrbench_suit/ocrbench_suit.yaml +++ b/lmms_eval/tasks/ocrbench_suit/ocrbench_suit.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-jsonl/ocrbench -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-jsonl/ocrbench ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "ocrbench_suit" diff --git a/lmms_eval/tasks/ocrbench_suit_vd/ocrbench_suit.yaml b/lmms_eval/tasks/ocrbench_suit_vd/ocrbench_suit.yaml index 034a4275b..b6403c659 100644 --- a/lmms_eval/tasks/ocrbench_suit_vd/ocrbench_suit.yaml +++ 
b/lmms_eval/tasks/ocrbench_suit_vd/ocrbench_suit.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-selected-VD/ocrbench -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-selected-VD/ocrbench ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "ocrbench_suit_vd" diff --git a/lmms_eval/tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml b/lmms_eval/tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml index 31b76aec1..da9dad0e7 100755 --- a/lmms_eval/tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml +++ b/lmms_eval/tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-VD/OK-VQA/ -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-VD/OK-VQA/ ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e group: ok_vqa_vd task: ok_vqa_val2014_vd test_split: train diff --git a/lmms_eval/tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml b/lmms_eval/tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml index c2bdb617b..32902a0ef 100755 --- a/lmms_eval/tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml +++ b/lmms_eval/tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-selected-VD/OK-VQA/ -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-selected-VD/OK-VQA/ ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e group: ok_vqa_suit_vd task: ok_vqa_suit_val2014_vd test_split: train diff --git a/lmms_eval/tasks/siwei_bench_sub1/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub1/siwei_bench.yaml deleted file mode 100755 index 3f4d60b3a..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1/siwei_bench.yaml +++ /dev/null @@ -1,3 +0,0 @@ -group: siwei_bench_sub1 -task: -- siwei_bench_layout diff --git a/lmms_eval/tasks/siwei_bench_sub1/siwei_bench_layout.yaml b/lmms_eval/tasks/siwei_bench_sub1/siwei_bench_layout.yaml deleted file mode 100644 index de87975fc..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1/siwei_bench_layout.yaml +++ /dev/null @@ -1,22 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Layout -task: "siwei_bench_layout" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub1/utils.py b/lmms_eval/tasks/siwei_bench_sub1/utils.py deleted file mode 100755 index 88c35ffbe..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1/utils.py +++ /dev/null @@ -1,96 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images, and four answers, one of them is correct. 
Please choose one of the three answers.\ - please only answer the question with A, B, C.\ - questions: {question} \ - answer: A: {A} B: {B} C: {C}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-C])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-C])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C']]) - if doc['answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench.yaml deleted file mode 100755 index d817255bd..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench.yaml +++ /dev/null @@ -1,3 +0,0 @@ -group: siwei_bench_sub1_vd -task: -- siwei_bench_layout_vd diff --git a/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench_layout.yaml b/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench_layout.yaml deleted file mode 100644 index d3abfaaab..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench_layout.yaml +++ /dev/null @@ -1,26 +0,0 @@ -<<<<<<< HEAD 
-dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Layout -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/Layout ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_layout_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub1_vd/utils.py b/lmms_eval/tasks/siwei_bench_sub1_vd/utils.py deleted file mode 100755 index b5254dbb6..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1_vd/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images , descriptions of each images, and three answers, one of them is correct. Please choose one of the three answers.\ - please only answer the question with A, B, C.\ - description of image1:{image1},description of image2:{image2},\ - questions: {question} \ - answer: A: {A} B: {B} C: {C}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(image1=doc['image1_VD'],image2=doc['image2_VD'],question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-C])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-C])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, 
start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C']]) - if doc['answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench.yaml deleted file mode 100755 index 63776cc64..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench.yaml +++ /dev/null @@ -1,9 +0,0 @@ -group: siwei_bench_sub2 -task: -- siwei_bench_atlocation -- siwei_bench_environment -- siwei_bench_madeof -- siwei_bench_nearby -- siwei_bench_partof -- siwei_bench_Used_For - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Environment.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Environment.yaml deleted file mode 100755 index cddb5c637..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Environment.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Environment -task: "siwei_bench_environment" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_MadeOf.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_MadeOf.yaml deleted file mode 100755 index e7f9e5275..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_MadeOf.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/MadeOf -task: "siwei_bench_madeof" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - 
diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Used_For.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Used_For.yaml deleted file mode 100755 index 6cb7d6fda..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Used_For.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Used_For -task: "siwei_bench_Used_For" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_atlocation.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_atlocation.yaml deleted file mode 100755 index 5599f63c1..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_atlocation.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/AtLocation -task: "siwei_bench_atlocation" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_nearby.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_nearby.yaml deleted file mode 100755 index 3519d3d22..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_nearby.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/NearBy -task: "siwei_bench_nearby" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_partof.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_partof.yaml deleted file mode 100755 index d0c9f4e41..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_partof.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: 
/xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/PartOf -task: "siwei_bench_partof" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/utils.py b/lmms_eval/tasks/siwei_bench_sub2/utils.py deleted file mode 100755 index 1ad5e31cb..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/utils.py +++ /dev/null @@ -1,96 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images, and four answers, one of them is correct. Please choose one of the four answers.\ - please only answer the question with A, B, C or D.\ - questions: {question} \ - answer: A: {A} B: {B} C: {C} D:{D}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C'],D=doc['options']['D']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-D])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-D])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - 
max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D']]) - if doc['answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench.yaml deleted file mode 100755 index 76edb2c46..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench.yaml +++ /dev/null @@ -1,7 +0,0 @@ -group: siwei_bench_sub2_shuffle -task: -- siwei_bench_atlocation_shuffle -- siwei_bench_madeof_shuffle -- siwei_bench_nearby_shuffle -- siwei_bench_partof_shuffle - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Environment.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Environment.yaml deleted file mode 100755 index 022295afe..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Environment.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/Environment -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/Environment ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_environment" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_MadeOf.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_MadeOf.yaml deleted file mode 100755 index aca4ddfb5..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_MadeOf.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/MadeOf -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/MadeOf ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_madeof_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: 
submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Used_For.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Used_For.yaml deleted file mode 100755 index 7c6e6de9b..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Used_For.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/Used_For -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/Used_For ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_Used_For" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_atlocation.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_atlocation.yaml deleted file mode 100755 index f81efdf64..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_atlocation.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/AtLocation -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/AtLocation ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_atlocation_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_nearby.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_nearby.yaml deleted file mode 100755 index 22775fcab..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_nearby.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/NearBy -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/NearBy ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_nearby_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - 
max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_partof.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_partof.yaml deleted file mode 100755 index abe151c5a..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_partof.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/PartOf -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/PartOf ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_partof_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/utils.py b/lmms_eval/tasks/siwei_bench_sub2_shuffle/utils.py deleted file mode 100755 index 84d8a355a..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/utils.py +++ /dev/null @@ -1,96 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images, and five answers, one of them is correct. 
Please choose one of the five answers.\ - please only answer the question with A, B, C, D or E.\ - questions: {question} \ - answer: A: {A} B: {B} C: {C} D:{D} E:{E}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C'],D=doc['options']['D'],E=doc['options']['E']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image2']).convert("RGB"),base64_to_pil_image(doc['image1']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-E])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-E])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D'],doc['options']['E']]) - if doc['last_answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench.yaml deleted file mode 100755 index 337456876..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench.yaml +++ /dev/null @@ -1,9 +0,0 @@ -group: siwei_bench_sub2_vd -task: -- siwei_bench_atlocation_vd -- siwei_bench_environment_vd -- siwei_bench_madeof_vd -- siwei_bench_nearby_vd -- siwei_bench_partof_vd -- siwei_bench_Used_For_vd - diff --git 
a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Environment.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Environment.yaml deleted file mode 100755 index 003175b65..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Environment.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Environment -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/Environment ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_environment_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_MadeOf.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_MadeOf.yaml deleted file mode 100755 index 2841565cf..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_MadeOf.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/MadeOf -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/MadeOf ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_madeof_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Used_For.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Used_For.yaml deleted file mode 100755 index dda863758..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Used_For.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Used_For -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/Used_For ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_Used_For_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function 
utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_atlocation.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_atlocation.yaml deleted file mode 100755 index 7e044a54b..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_atlocation.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/AtLocation -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/AtLocation ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_atlocation_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_nearby.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_nearby.yaml deleted file mode 100755 index 5f92e3a20..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_nearby.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/NearBy -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/NearBy ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_nearby_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_partof.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_partof.yaml deleted file mode 100755 index 689290d44..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_partof.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/PartOf -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/PartOf ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_partof_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: 
exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/utils.py b/lmms_eval/tasks/siwei_bench_sub2_vd/utils.py deleted file mode 100755 index e4db4ac1b..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images, descriptions of each images, and four answers, one of them is correct. Please choose one of the four answers.\ - please only answer the question with A, B, C or D.\ - description of image1:{image1},description of image2:{image2},\ - questions: {question} \ - answer: A: {A} B: {B} C: {C} D:{D}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(image1=doc['image1_VD'],image2=doc['image2_VD'],question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C'],D=doc['options']['D']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-D])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-D])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], 
doc['options']['B'], doc['options']['C'], doc['options']['D']]) - if doc['answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench.yaml deleted file mode 100755 index e48c71e16..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench.yaml +++ /dev/null @@ -1,7 +0,0 @@ -group: siwei_bench_sub3 -task: -- siwei_bench_shapesimilarto -- siwei_bench_subevent -- siwei_bench_similar_event -- siwei_bench_hasproperty - diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_hasproperty.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_hasproperty.yaml deleted file mode 100644 index d79677074..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_hasproperty.yaml +++ /dev/null @@ -1,36 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty -task: "siwei_bench_hasproperty" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # - metric: submission - # aggregation: !function utils.siwei_bench_aggregate_submissions - # higher_is_better: true - # - metric: siwei_bench_precision - # aggregation: !function utils.siwei_bench_aggregate_precision - # higher_is_better: true - # - metric: siwei_bench_recall - # aggregation: !function utils.siwei_bench_aggregate_recall - # higher_is_better: true - # - metric: siwei_bench_f1_score - # aggregation: !function utils.siwei_bench_aggregate_f1_score - # higher_is_better: true - # - metric: siwei_bench_yes_ratio - # aggregation: !function utils.siwei_bench_aggregate_yes_ratio - # higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_shapesimilarto.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_shapesimilarto.yaml deleted file mode 100755 index 2373c9b83..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_shapesimilarto.yaml +++ /dev/null @@ -1,20 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/ShapeSimilarTo -task: "siwei_bench_shapesimilarto" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_similar_event.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_similar_event.yaml deleted file mode 100644 
index 95a79417c..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_similar_event.yaml +++ /dev/null @@ -1,20 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent -task: "siwei_bench_similar_event" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_subevent.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_subevent.yaml deleted file mode 100644 index 3d12437a3..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_subevent.yaml +++ /dev/null @@ -1,20 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SubEvent -task: "siwei_bench_subevent" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3/utils.py b/lmms_eval/tasks/siwei_bench_sub3/utils.py deleted file mode 100755 index abbd6d187..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/utils.py +++ /dev/null @@ -1,138 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics -import re - -def remove_punctuation(text): - # 定义正则表达式模式以匹配所有标点符号 - pattern = r'[^\w\s]' - # 使用正则表达式替换标点符号为空字符串 - return re.sub(pattern, '', text) - -PROMPT = 'You will be giving one question and two images. Please answer the question using "Yes" or "No". 
\ - Please only answer the question with Yes or No.\ - questions: {question} \ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual -def extract_yes_no(response): - # 定义正则表达式模式,匹配 "yes" 或 "no" - pattern = r'\b(yes|no)\b' - # 使用正则表达式搜索response中的匹配项 - matches = re.findall(pattern, response, re.IGNORECASE) - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - # 返回匹配项列表 - return None - - - -def siwei_bench_process_results(doc, results): - response = remove_punctuation(results[0]) - pred = response.lower().strip() - gt_ans = doc["answer"].lower().strip() - # idx=doc["idx"] - assert gt_ans in ["yes", "no"] - if pred not in ["yes", "no"]: - pred=extract_yes_no(pred) - score = 1.0 if pred == gt_ans else 0.0 - # predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D']]) - # if doc['answer']==predict: - # accuracy=1.0 - # else: - # accuracy=0.0 - return {"exact_match": score,"submission": {"id": doc["idx"], "predict_answer": pred, "response": response}} - - -# def siwei_bench_aggregate_accuracy(results): -# total_score = 0 -# for result in results: -# total_score += result["score"] -# avg_score = total_score / len(results) -# return avg_score - - -# def siwei_bench_aggregate_precision(results): -# true_positives = 0 -# false_positives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "no" and pred == "yes": -# false_positives += 1 -# precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 -# return precision - - -# def siwei_bench_aggregate_recall(results): -# true_positives = 0 -# false_negatives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "yes" and pred == "no": -# false_negatives += 1 -# recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 -# return recall - - -# def siwei_bench_aggregate_f1_score(results): -# precision = pope_aggregate_precision(results) -# recall = pope_aggregate_recall(results) -# 
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 -# return f1_score - - -# def siwei_bench_aggregate_yes_ratio(results): -# yes_count = 0 -# no_count = 0 -# for result in results: -# gt = result["ground_truth"] -# if gt == "yes": -# yes_count += 1 -# elif gt == "no": -# no_count += 1 -# yes_ratio = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 -# return yes_ratio diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench.yaml deleted file mode 100755 index 6581d4f02..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench.yaml +++ /dev/null @@ -1,5 +0,0 @@ -group: siwei_bench_sub3_shuffle -task: -- siwei_bench_shapesimilarto_shuffle -- siwei_bench_subevent_shuffle - diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_hasproperty.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_hasproperty.yaml deleted file mode 100644 index c128da9c7..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_hasproperty.yaml +++ /dev/null @@ -1,38 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_hasproperty" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # - metric: submission - # aggregation: !function utils.siwei_bench_aggregate_submissions - # higher_is_better: true - # - metric: siwei_bench_precision - # aggregation: !function utils.siwei_bench_aggregate_precision - # higher_is_better: true - # - metric: siwei_bench_recall - # aggregation: !function utils.siwei_bench_aggregate_recall - # higher_is_better: true - # - metric: siwei_bench_f1_score - # aggregation: !function utils.siwei_bench_aggregate_f1_score - # higher_is_better: true - # - metric: siwei_bench_yes_ratio - # aggregation: !function utils.siwei_bench_aggregate_yes_ratio - # higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_shapesimilarto.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_shapesimilarto.yaml deleted file mode 100755 index dbc4762dd..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_shapesimilarto.yaml +++ /dev/null @@ -1,22 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/ShapeSimilarTo -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/ShapeSimilarTo ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_shapesimilarto_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - 
higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_similar_event.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_similar_event.yaml deleted file mode 100644 index d0abdad5c..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_similar_event.yaml +++ /dev/null @@ -1,22 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_similar_event" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_subevent.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_subevent.yaml deleted file mode 100644 index 461fbe6bb..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_subevent.yaml +++ /dev/null @@ -1,24 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/SubEvent -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/SubEvent ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_subevent_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/utils.py b/lmms_eval/tasks/siwei_bench_sub3_shuffle/utils.py deleted file mode 100755 index 6f528feb3..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/utils.py +++ /dev/null @@ -1,138 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics -import re - -def remove_punctuation(text): - # 定义正则表达式模式以匹配所有标点符号 - pattern = r'[^\w\s]' - # 使用正则表达式替换标点符号为空字符串 - return re.sub(pattern, '', text) - -PROMPT = 'You will be giving one question and two images. Please answer the question using "Yes" or "No". 
\ - Please only answer the question with Yes or No.\ - questions: {question} \ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image2']).convert("RGB"),base64_to_pil_image(doc['image1']).convert("RGB")] ######################################### load image from base64 encoding - return visual -def extract_yes_no(response): - # 定义正则表达式模式,匹配 "yes" 或 "no" - pattern = r'\b(yes|no)\b' - # 使用正则表达式搜索response中的匹配项 - matches = re.findall(pattern, response, re.IGNORECASE) - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - # 返回匹配项列表 - return None - - - -def siwei_bench_process_results(doc, results): - response = remove_punctuation(results[0]) - pred = response.lower().strip() - gt_ans = doc["last_answer"].lower().strip() - # idx=doc["idx"] - assert gt_ans in ["yes", "no"] - if pred not in ["yes", "no"]: - pred=extract_yes_no(pred) - score = 1.0 if pred == gt_ans else 0.0 - # predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D']]) - # if doc['answer']==predict: - # accuracy=1.0 - # else: - # accuracy=0.0 - return {"exact_match": score,"submission": {"id": doc["idx"], "predict_answer": pred, "response": response}} - - -# def siwei_bench_aggregate_accuracy(results): -# total_score = 0 -# for result in results: -# total_score += result["score"] -# avg_score = total_score / len(results) -# return avg_score - - -# def siwei_bench_aggregate_precision(results): -# true_positives = 0 -# false_positives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "no" and pred == "yes": -# false_positives += 1 -# precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 -# return precision - - -# def siwei_bench_aggregate_recall(results): -# true_positives = 0 -# false_negatives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "yes" and pred == "no": -# false_negatives += 1 -# recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 -# return recall - - -# def siwei_bench_aggregate_f1_score(results): -# precision = pope_aggregate_precision(results) -# recall = pope_aggregate_recall(results) 
-# f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 -# return f1_score - - -# def siwei_bench_aggregate_yes_ratio(results): -# yes_count = 0 -# no_count = 0 -# for result in results: -# gt = result["ground_truth"] -# if gt == "yes": -# yes_count += 1 -# elif gt == "no": -# no_count += 1 -# yes_ratio = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 -# return yes_ratio diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench.yaml deleted file mode 100755 index 2d2bf6c2e..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench.yaml +++ /dev/null @@ -1,7 +0,0 @@ -group: siwei_bench_sub3_vd -task: -- siwei_bench_shapesimilarto_vd -- siwei_bench_subevent_vd -- siwei_bench_similar_event_vd -- siwei_bench_hasproperty_vd - diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_hasproperty.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_hasproperty.yaml deleted file mode 100644 index 813fdfa34..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_hasproperty.yaml +++ /dev/null @@ -1,38 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_hasproperty_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # - metric: submission - # aggregation: !function utils.siwei_bench_aggregate_submissions - # higher_is_better: true - # - metric: siwei_bench_precision - # aggregation: !function utils.siwei_bench_aggregate_precision - # higher_is_better: true - # - metric: siwei_bench_recall - # aggregation: !function utils.siwei_bench_aggregate_recall - # higher_is_better: true - # - metric: siwei_bench_f1_score - # aggregation: !function utils.siwei_bench_aggregate_f1_score - # higher_is_better: true - # - metric: siwei_bench_yes_ratio - # aggregation: !function utils.siwei_bench_aggregate_yes_ratio - # higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_shapesimilarto.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_shapesimilarto.yaml deleted file mode 100755 index a7ac8049d..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_shapesimilarto.yaml +++ /dev/null @@ -1,22 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/ShapeSimilarTo -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/ShapeSimilarTo ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_shapesimilarto_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - 
higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_similar_event.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_similar_event.yaml deleted file mode 100644 index bb6c0fbbc..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_similar_event.yaml +++ /dev/null @@ -1,22 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_similar_event_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_subevent.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_subevent.yaml deleted file mode 100644 index 8d4535189..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_subevent.yaml +++ /dev/null @@ -1,24 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SubEvent -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/SubEvent ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_subevent_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/utils.py b/lmms_eval/tasks/siwei_bench_sub3_vd/utils.py deleted file mode 100755 index 14928b7cc..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/utils.py +++ /dev/null @@ -1,140 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics -import re - -def remove_punctuation(text): - # 定义正则表达式模式以匹配所有标点符号 - pattern = r'[^\w\s]' - # 使用正则表达式替换标点符号为空字符串 - return re.sub(pattern, '', text) - -PROMPT = 'You will be giving one question , two images and descriptions of each images, Please answer the question using "Yes" or "No". 
\ - Please only answer the question with Yes or No.\ - description of image1:{image1},description of image2:{image2},\ - questions: {question} \ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(image1=doc['image1_VD'],image2=doc['image2_VD'],question=doc['question']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual -def extract_yes_no(response): - # 定义正则表达式模式,匹配 "yes" 或 "no" - pattern = r'\b(yes|no)\b' - # 使用正则表达式搜索response中的匹配项 - matches = re.findall(pattern, response, re.IGNORECASE) - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - - # 返回匹配项列表 - return None - - - -def siwei_bench_process_results(doc, results): - response = remove_punctuation(results[0]) - pred = response.lower().strip() - gt_ans = doc["answer"].lower().strip() - # idx=doc["idx"] - assert gt_ans in ["yes", "no"] - if pred not in ["yes", "no"]: - pred=extract_yes_no(pred) - score = 1.0 if pred == gt_ans else 0.0 - # predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D']]) - # if doc['answer']==predict: - # accuracy=1.0 - # else: - # accuracy=0.0 - return {"exact_match": score,"submission": {"id": doc["idx"], "predict_answer": pred, "response": response}} - - -# def siwei_bench_aggregate_accuracy(results): -# total_score = 0 -# for result in results: -# total_score += result["score"] -# avg_score = total_score / len(results) -# return avg_score - - -# def siwei_bench_aggregate_precision(results): -# true_positives = 0 -# false_positives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "no" and pred == "yes": -# false_positives += 1 -# precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 -# return precision - - -# def siwei_bench_aggregate_recall(results): -# true_positives = 0 -# false_negatives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "yes" and pred == "no": -# false_negatives += 1 -# recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 -# return recall - - -# def 
siwei_bench_aggregate_f1_score(results): -# precision = pope_aggregate_precision(results) -# recall = pope_aggregate_recall(results) -# f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 -# return f1_score - - -# def siwei_bench_aggregate_yes_ratio(results): -# yes_count = 0 -# no_count = 0 -# for result in results: -# gt = result["ground_truth"] -# if gt == "yes": -# yes_count += 1 -# elif gt == "no": -# no_count += 1 -# yes_ratio = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 -# return yes_ratio diff --git a/test.py b/test.py deleted file mode 100644 index 58cdc3aa9..000000000 --- a/test.py +++ /dev/null @@ -1,6 +0,0 @@ -import json -with open('/ML-A100/team/mm/zhangge/domain_data_pipeline/llm_label_data_pipeline/fasttext_seed_data/chemistry/pos/pos.jsonl') as jsonl_file: - for line in jsonl_file: - data=json.loads(line) - print(data) - break \ No newline at end of file diff --git a/test_blip.py b/test_blip.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_cambrian.py b/test_cambrian.py deleted file mode 100644 index d8409d25b..000000000 --- a/test_cambrian.py +++ /dev/null @@ -1,8 +0,0 @@ -from lmms_eval.models.cambrian_8b import * -from cambrian.model.builder import load_pretrained_model -from cambrian.conversation import conv_templates, SeparatorStyle -from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -pretrained='/ML-A100/team/mm/zhangge/models/cambrian_8b' -model_name = get_model_name_from_path(pretrained) -print(model_name) -tokenizer, model, image_processor, context_len = load_pretrained_model(pretrained, None, model_name) \ No newline at end of file diff --git a/visual_code/echart.js b/visual_code/echart.js new file mode 100644 index 000000000..05e06c29a --- /dev/null +++ b/visual_code/echart.js @@ -0,0 +1,145 @@ +var data = [ + { + "name": "Descriptive", + "children": [ + { + "name": "General Description", + "value": 142 + }, + { + "name": "Activity Description", + "value": 1 + } + ] + }, + { + "name": "Analytical", + "children": [ + { + "name": "Data Analysis", + "value": 197 + }, + { + "name": "Cultural Analysis", + "value": 2 + }, + { + "name": "Analytical - Data Analysis", + "value": 1 + }, + { + "name": "Attribute-based Question Answer", + "value": 2 + } + ] + }, + { + "name": "Recognition", + "children": [ + { + "name": "Object Recognition", + "value": 339 + }, + { + "name": "Location Identification", + "value": 2 + }, + { + "name": "Text Recognition", + "value": 89 + }, + 1 + ] + }, + { + "name": "Instructive", + "children": [ + { + "name": "How-to Guides", + "value": 16 + } + ] + }, + { + "name": "Comprehensive", + "children": [ + { + "name": "Cultural Analysis", + "value": 7 + } + ] + }, + { + "name": "Question", + "children": [ + { + "name": "Recognition", + "value": 1 + } + ] + }, + { + "name": "Instructional", + "children": [ + { + "name": "Math Problem Solving", + "value": 1 + } + ] + } +]; +option = { + title: { + text: 'WORLD COFFEE RESEARCH SENSORY LEXICON', + subtext: 'Source: https://worldcoffeeresearch.org/work/sensory-lexicon/', + textStyle: { + fontSize: 10, + align: 'center' + }, + subtextStyle: { + align: 'center' + }, + sublink: 'https://worldcoffeeresearch.org/work/sensory-lexicon/' + }, + series: { + type: 'sunburst', + data: data, + radius: [0, '95%'], + sort: undefined, + emphasis: { + focus: 'ancestor' + }, + levels: [ + {}, + { + r0: '15%', + r: '35%', + itemStyle: { + borderWidth: 2 + }, + label: { + rotate: 
'tangential' + } + }, + { + r0: '35%', + r: '70%', + label: { + align: 'right' + } + }, + { + r0: '70%', + r: '72%', + label: { + position: 'outside', + padding: 3, + silent: false + }, + itemStyle: { + borderWidth: 3 + } + } + ] + } +}; \ No newline at end of file diff --git a/visual_code/echart_new.js b/visual_code/echart_new.js new file mode 100644 index 000000000..18c7d5294 --- /dev/null +++ b/visual_code/echart_new.js @@ -0,0 +1,283 @@ +var data = [ + { + "name": "Descriptive", + "itemStyle": { + "color": 'rgba(44, 157, 143)' // 基准颜色:蓝色 + }, + "label": { + "fontSize": 16 // 设置字体大小 + }, + "children": [ + { + "name": "General Description", + "value": 146, + "itemStyle": { + "color": 'rgba(108, 186, 177)' // 渐变:较深的颜色 + + }, + "label": { + 'position': 'inside', + 'rotate': 'tangential', + "fontSize": 16, // 设置字体大小 + "color": '#FFFFFF' // 设置字体颜色为白色 + + } + }, + { + "name": "Activity Description", + "value": 6, + "itemStyle": { + "color":'rgba(108, 186, 177)',// 渐变:较浅的颜色 + }, + label: { + align: 'right', + "fontSize": 10, + "color": '#FFFFFF' // 设置字体颜色为白色 + + }, + } + ] + }, + { + "name": "Analytical", + "itemStyle": { + "color": 'rgba(243, 162, 98)' // 基准颜色:蓝绿色 + }, + "label": { + "fontSize": 16 // 设置字体大小 + }, + "children": [ + { + "name": "Data Analysis", + "value": 197, + "itemStyle": { + "color": 'rgba(246, 190, 146)' // 较深的蓝绿色 + }, + "label": { + 'position': 'inside', + 'rotate': 'tangential', + "fontSize": 16 // 设置字体大小 + } + }, + { + "name": "Cultural Analysis", + "value": 6, + "itemStyle": { + "color": 'rgba(246, 190, 146)' // 较浅的蓝绿色 + }, + label: { + align: 'right', + "fontSize": 8, + }, + }, + { + "name": "Analytical - Data Analysis", + "value": 7, + "itemStyle": { + "color": 'rgba(246, 190, 146)' // 更浅的蓝绿色 + }, + label: { + align: 'right', + "fontSize": 8, + }, + }, + { + "name": "Attribute-based Question Answer", + "value": 6, + "itemStyle": { + "color": 'rgba(246, 190, 146)' // 最浅的蓝绿色 + }, + label: { + align: 'right', + "fontSize": 8, + }, + } + ] + }, + { + "name": "Recognition", + "itemStyle": { + "color":'rgba(232, 195, 107)' // 基准颜色:绿色 + }, + "label": { + "fontSize": 16 // 设置字体大小 + }, + "children": [ + { + "name": "Object Recognition", + "value": 339, + "itemStyle": { + "color": 'rgba(237, 213, 151)' // 较深的绿色 + }, + "label": { + 'position': 'inside', + 'rotate': 'tangential', + "fontSize": 16 // 设置字体大小 + } + }, + { + "name": "Location Identification", + "value": 6, + "itemStyle": { + "color": 'rgba(237, 213, 151)' // 较浅的绿色 + }, + label: { + align: 'right', + "fontSize": 10, + }, + }, + { + "name": "Text Recognition", + "value": 89, + "itemStyle": { + "color": 'rgba(237, 213, 151)' // 更浅的绿色 + }, + "label": { + 'position': 'inside', + 'rotate': 'tangential', + "fontSize": 16 // 设置字体大小 + } + }, + { + "name": "Other", + "value": 15, + "itemStyle": { + "color": 'rgba(237, 213, 151)' // 最浅的绿色 + }, + label: { + align: 'right', + "fontSize": 16, + }, + } + ] + }, + { + "name": "Instructive", + "itemStyle": { + "color": '#AEC48F' // 基准颜色:浅绿色 + }, + label: { + align: 'right' + }, + "children": [ + { + "name": "How-to Guides", + "value": 16, + "itemStyle": { + "color": 'rgba(176, 196, 145, 0.8)' // 较深的浅绿色 + }, + label: { + align: 'right', + "fontSize": 16 + }, + } + ] + }, + { + "name": "Comprehensive", + "itemStyle": { + "color": '#FFDB5C' // 基准颜色:黄色 + }, + label: { + align: 'right', + "fontSize": 9 + }, + "children": [ + { + "name": "Cultural Analysis", + "value": 7, + "itemStyle": { + "color": 'rgba(255, 219, 96, 0.8)' // 较深的黄色 + }, + label: { + align: 'right', + "fontSize": 14 + }, 
+      }
+    ]
+  },
+  {
+    "name": "Question",
+    "itemStyle": {
+      "color": '#F98862' // base color: orange
+    },
+    label: {
+      align: 'right',
+      "fontSize": 8
+    },
+
+    "children": [
+      {
+        "name": "Recognition",
+        "value": 6,
+        "itemStyle": {
+          "color": 'rgba(249, 136, 98, 0.8)' // gradient orange
+        },
+        label: {
+          align: 'right',
+          "fontSize": 12
+        },
+      }
+    ]
+  },
+  {
+    "name": "Instructional",
+    "itemStyle": {
+      "color": '#E84A5F' // base color: red
+    },
+    label: {
+      align: 'right',
+      "fontSize": 8,
+      "color": '#000000' // set the font color to black
+
+    },
+    "children": [
+      {
+        "name": "Math Problem Solving",
+        "value": 6,
+        "itemStyle": {
+          "color": 'rgba(236, 76, 95, 0.8)' // gradient red
+        },
+        label: {
+          align: 'right',
+          "fontSize": 8,
+          "color": '#000000' // set the font color to black
+
+        },
+      }
+    ]
+  }
+];
+
+option = {
+  series: {
+    type: 'sunburst',
+    data: data,
+    radius: [0, '100%'],
+    label: {
+      rotate: 'radial',
+      fontSize: 12, // uniform font size
+      fontFamily: 'Arial', // uniform font family
+    },
+    levels: [
+      {},
+      {
+        r0: '0%',
+        r: '50%',
+        itemStyle: {
+          borderWidth: 2
+        },
+        label: {
+          // align: 'right'
+        }
+      },
+      {
+        r0: '50%',
+        r: '100%',
+        label: {
+          // align: 'right'
+        }
+      }
+    ]
+  }
+};
\ No newline at end of file
diff --git a/visual_code/plot_map.py b/visual_code/plot_map.py
new file mode 100644
index 000000000..5b539b2dd
--- /dev/null
+++ b/visual_code/plot_map.py
@@ -0,0 +1,108 @@
+import json
+from collections import defaultdict
+import numpy as np
+from scipy.interpolate import make_interp_spline
+import matplotlib.pyplot as plt
+from matplotlib import cm
+from matplotlib.colors import LinearSegmentedColormap
+# Load the JSON data
+with open('/gpfs/public/research/zk/lmms-eval/result/category_result.json', 'r') as f:
+    data = json.load(f)
+curve_color = (128/255, 0/255, 128/255)
+# Build a dictionary that stores each category and its subcategories
+category_data = defaultdict(lambda: defaultdict(int))
+
+# Iterate over the data and count the distribution of question_category and question_subcategory
+for entry in data:
+    category = entry["question_category"]
+    subcategory = entry["question_subcategory"]
+    category_data[category][subcategory] += 1
+
+# Flattening the two-dimensional dictionary to a list of tuples (category, subcategory, count)
+flat_data = []
+for category, subcategories in category_data.items():
+    for subcategory, count in subcategories.items():
+        if '(' in subcategory:
+            subcategory = subcategory.split('(')[0]
+        flat_data.append((category, subcategory, count))
+
+# Sorting the flat data by count in descending order
+flat_data_sorted = sorted(flat_data, key=lambda x: x[2], reverse=True)
+
+# Extracting the subcategory names and counts
+subcategory_names = [f"{item[1]}" for item in flat_data_sorted]
+counts = [item[2] for item in flat_data_sorted]
+
+# Calculate total count to convert frequencies to probabilities
+total_count = sum(counts)
+probabilities = [count / total_count for count in counts]  # Convert to probabilities
+
+# Prepare x and y values
+x = np.arange(len(probabilities))
+y = np.array(probabilities)
+
+# Generate a smooth curve using spline interpolation
+x_smooth = np.linspace(x.min(), x.max(), 300)  # Create 300 points between min and max of x
+spl = make_interp_spline(x, y, k=3)  # Spline of degree 3 for smooth curve
+y_smooth = spl(x_smooth)
+
+# Calculate cumulative probabilities
+cumulative_probabilities = np.cumsum(probabilities)
+cumulative_prob_smooth = make_interp_spline(x, cumulative_probabilities, k=3)(x_smooth)
+
+# Define a custom blue color gradient that starts from dark blue to light blue
+custom_blue_cmap = LinearSegmentedColormap.from_list("custom_blue", [(0, 0, 0.5), (0.5, 0.75, 1)], N=256)
+
+# Normalize the cumulative probabilities to get a gradient that moves from deep to light
+norm = plt.Normalize(vmin=cumulative_prob_smooth.min(), vmax=cumulative_prob_smooth.max())
+
+# Generate gradient color for each point based on the cumulative probability using the custom colormap
+colors = custom_blue_cmap(norm(cumulative_prob_smooth))
+# Plotting the smooth probability distribution curve with gradient based on cumulative values
+plt.figure(figsize=(12, 6))
+
+# Plot the smooth curve outline
+plt.plot(x_smooth, y_smooth, color='black', linewidth=2)
+
+# Fill the area under the curve with a color gradient from deep to light based on cumulative probabilities
+for i in range(len(x_smooth) - 1):
+    plt.fill_between(x_smooth[i:i+2], y_smooth[i:i+2], color=colors[i], alpha=0.8)
+
+# Enhancing ICLR-like style
+plt.grid(True, which='both', axis='x', linestyle='--', linewidth=0.5)  # Vertical gridlines at the x-tick positions only
+plt.xticks(x, subcategory_names, rotation=45, ha='right', fontsize=10, fontweight='bold')  # X-axis labels angled and aligned
+# plt.title("Smoothed Probability Distribution with Deep-to-Light Gradient", fontsize=14)
+# plt.xlabel("Category - Subcategory", fontsize=12)
+plt.ylabel("Probability", fontsize=12)
+
+# Turn off the horizontal gridlines (the y-axis gridlines)
+plt.grid(False, axis='y')
+
+# Adjust the curve to start at the y-axis
+plt.xlim(left=0)  # Start from y-axis (x=0)
+plt.tight_layout()
+
+# Save the figure before displaying it, then show the plot
+plt.savefig('/gpfs/public/research/zk/lmms-eval/static_figs/statics_fig.png')
+plt.savefig('/gpfs/public/research/zk/lmms-eval/static_figs/statics_fig.pdf')
+plt.show()
+# print(category_data)
+# Build the node format needed for the sunburst chart
+# def create_node(name, value=None, color=None, children=None):
+#     node = {"name": name}
+#     if value:
+#         node["value"] = value
+#     if color:
+#         node["itemStyle"] = {"color": color}
+#     if children:
+#         node["children"] = children
+#     return node
+
+# # Generate the sunburst data
+# sunburst_data = []
+# for category, subcategories in category_data.items():
+#     children = [create_node(subcat, value=count) for subcat, count in subcategories.items()]
+#     sunburst_data.append(create_node(category, children=children))
+
+# # Print the sunburst data
+# print(json.dumps(sunburst_data, indent=2))