From 8e21e95b8c7fd3d368335d74b2fa56b89872be26 Mon Sep 17 00:00:00 2001 From: evergreen-bupt <609531932@qq.com> Date: Mon, 7 Oct 2024 02:24:39 +0800 Subject: [PATCH] update 10.7 pure --- .gitignore | 5 + lmms_eval/models/__init__.py | 9 +- lmms_eval/models/cambrian_8b.py | 438 ------------------ .../models/{cambrian.py => cambrian_model.py} | 42 +- lmms_eval/models/emu2.py | 31 -- lmms_eval/models/gemma.py | 2 +- lmms_eval/models/gpt4v_01.py | 202 ++++++++ lmms_eval/models/gpt4v_01_batch.py | 224 +++++++++ lmms_eval/models/idefics2.py | 8 - lmms_eval/models/internvl2.py | 11 - lmms_eval/models/minicpm_v.py | 4 - lmms_eval/models/paligemma.py | 66 --- lmms_eval/models/qwen.py | 2 +- lmms_eval/models/test_swwu.py | 78 ---- lmms_eval/models/yi.py | 2 +- lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml | 4 - lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml | 4 - lmms_eval/tasks/chartqa_all_vd/chartqa.yaml | 4 - lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml | 4 - lmms_eval/tasks/mmmu/mmmu_aug.yaml | 2 +- lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml | 2 +- .../tasks/ocrbench_all_vd/ocrbench_suit.yaml | 4 - .../tasks/ocrbench_suit/ocrbench_suit.yaml | 4 - .../tasks/ocrbench_suit_vd/ocrbench_suit.yaml | 4 - .../tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml | 4 - .../tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml | 4 - .../tasks/siwei_bench_sub1/siwei_bench.yaml | 3 - .../siwei_bench_sub1/siwei_bench_layout.yaml | 22 - lmms_eval/tasks/siwei_bench_sub1/utils.py | 96 ---- .../siwei_bench_sub1_vd/siwei_bench.yaml | 3 - .../siwei_bench_layout.yaml | 26 -- lmms_eval/tasks/siwei_bench_sub1_vd/utils.py | 97 ---- .../tasks/siwei_bench_sub2/siwei_bench.yaml | 9 - .../siwei_bench_Environment.yaml | 26 -- .../siwei_bench_sub2/siwei_bench_MadeOf.yaml | 26 -- .../siwei_bench_Used_For.yaml | 26 -- .../siwei_bench_atlocation.yaml | 26 -- .../siwei_bench_sub2/siwei_bench_nearby.yaml | 26 -- .../siwei_bench_sub2/siwei_bench_partof.yaml | 26 -- lmms_eval/tasks/siwei_bench_sub2/utils.py | 96 ---- .../siwei_bench_sub2_shuffle/siwei_bench.yaml | 7 - .../siwei_bench_Environment.yaml | 30 -- .../siwei_bench_MadeOf.yaml | 30 -- .../siwei_bench_Used_For.yaml | 30 -- .../siwei_bench_atlocation.yaml | 30 -- .../siwei_bench_nearby.yaml | 30 -- .../siwei_bench_partof.yaml | 30 -- .../tasks/siwei_bench_sub2_shuffle/utils.py | 96 ---- .../siwei_bench_sub2_vd/siwei_bench.yaml | 9 - .../siwei_bench_Environment.yaml | 30 -- .../siwei_bench_MadeOf.yaml | 30 -- .../siwei_bench_Used_For.yaml | 30 -- .../siwei_bench_atlocation.yaml | 30 -- .../siwei_bench_nearby.yaml | 30 -- .../siwei_bench_partof.yaml | 30 -- lmms_eval/tasks/siwei_bench_sub2_vd/utils.py | 97 ---- .../tasks/siwei_bench_sub3/siwei_bench.yaml | 7 - .../siwei_bench_hasproperty.yaml | 36 -- .../siwei_bench_shapesimilarto.yaml | 20 - .../siwei_bench_similar_event.yaml | 20 - .../siwei_bench_subevent.yaml | 20 - lmms_eval/tasks/siwei_bench_sub3/utils.py | 138 ------ .../siwei_bench_sub3_shuffle/siwei_bench.yaml | 5 - .../siwei_bench_hasproperty.yaml | 38 -- .../siwei_bench_shapesimilarto.yaml | 22 - .../siwei_bench_similar_event.yaml | 22 - .../siwei_bench_subevent.yaml | 24 - .../tasks/siwei_bench_sub3_shuffle/utils.py | 138 ------ .../siwei_bench_sub3_vd/siwei_bench.yaml | 7 - .../siwei_bench_hasproperty.yaml | 38 -- .../siwei_bench_shapesimilarto.yaml | 22 - .../siwei_bench_similar_event.yaml | 22 - .../siwei_bench_subevent.yaml | 24 - lmms_eval/tasks/siwei_bench_sub3_vd/utils.py | 140 ------ test.py | 6 - test_blip.py | 0 test_cambrian.py | 8 - visual_code/echart.js | 145 ++++++ 
visual_code/echart_new.js | 283 +++++++++++ visual_code/plot_map.py | 108 +++++ 80 files changed, 989 insertions(+), 2545 deletions(-) delete mode 100644 lmms_eval/models/cambrian_8b.py rename lmms_eval/models/{cambrian.py => cambrian_model.py} (93%) create mode 100755 lmms_eval/models/gpt4v_01.py create mode 100755 lmms_eval/models/gpt4v_01_batch.py delete mode 100644 lmms_eval/models/test_swwu.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub1/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub1/siwei_bench_layout.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub1/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench_layout.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub1_vd/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Environment.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_MadeOf.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Used_For.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_atlocation.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_nearby.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/siwei_bench_partof.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Environment.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_MadeOf.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Used_For.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_atlocation.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_nearby.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_partof.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_shuffle/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Environment.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_MadeOf.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Used_For.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_atlocation.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_nearby.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_partof.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub2_vd/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub3/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3/siwei_bench_hasproperty.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3/siwei_bench_shapesimilarto.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3/siwei_bench_similar_event.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3/siwei_bench_subevent.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_hasproperty.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_shapesimilarto.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_similar_event.yaml delete mode 100644 
lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_subevent.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_shuffle/utils.py delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_hasproperty.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_shapesimilarto.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_similar_event.yaml delete mode 100644 lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_subevent.yaml delete mode 100755 lmms_eval/tasks/siwei_bench_sub3_vd/utils.py delete mode 100644 test.py delete mode 100644 test_blip.py delete mode 100644 test_cambrian.py create mode 100644 visual_code/echart.js create mode 100644 visual_code/echart_new.js create mode 100644 visual_code/plot_map.py diff --git a/.gitignore b/.gitignore index 1188fd9dd..b04121242 100755 --- a/.gitignore +++ b/.gitignore @@ -104,3 +104,8 @@ describe/ clip-vit-base-patch32/ ai2d_check/ lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc +data_clean/ +gpt_4v/ +.js +clip-vit-base-patch32/ +*.whl \ No newline at end of file diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index bd8ae7dde..52e147672 100755 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -46,24 +46,17 @@ "qwen":"Qwen", 'llama':"Llama", "gemma":"Gemma", -<<<<<<< HEAD - "cambrian":"Cambrian", -======= - "cambrian_8b":"Cambrian_8b", ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e + "cambrian_model":"Cambrian", "internvl2": "InternVL2", "mantis": "Mantis", "emu2": "Emu2", "paligemma":"Paligemma", -<<<<<<< HEAD "internvl2_large":"InternVL2_large", "MIO_sft":"MIO", "onevision":"onevision", "onevision_large":"onevision_large", "cogvlm2":"cogvlm2", "MIO_batch":"MIO_batch" -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e } for model_name, model_class in AVAILABLE_MODELS.items(): diff --git a/lmms_eval/models/cambrian_8b.py b/lmms_eval/models/cambrian_8b.py deleted file mode 100644 index 6fb730174..000000000 --- a/lmms_eval/models/cambrian_8b.py +++ /dev/null @@ -1,438 +0,0 @@ -import torch -import os -import sys -from tqdm import tqdm -from lmms_eval import utils -from lmms_eval.api.instance import Instance -from lmms_eval.api.model import lmms -from PIL import Image -from datetime import timedelta -from lmms_eval.api.registry import register_model -# from lmms_eval.models.model_utils.qwen.qwen_generate_utils import make_context -# 获取当前文件的目录 -# current_dir = os.path.dirname(os.path.abspath(__file__)) -# # 获取同级目录的路径 -# parent_dir = os.path.dirname(current_dir) - -# # 将同级目录添加到模块搜索路径 -# sys.path.append(parent_dir) -# sys.path.append(current_dir) -# os.chdir('/ML-A100/team/mm/zk/lmms-eval/lmms_eval/models/cambrian') -from cambrian.model.builder import load_pretrained_model -from cambrian.conversation import conv_templates, SeparatorStyle -from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs -from accelerate.state import AcceleratorState -from typing import List, Optional, Union, Tuple -import uuid -import warnings -from transformers import PreTrainedTokenizer -temperature = 0 -CONTROLLER_HEART_BEAT_EXPIRATION = 30 -WORKER_HEART_BEAT_INTERVAL = 15 - -LOGDIR = "." 
- -# Model Constants -IGNORE_INDEX = -100 -IMAGE_TOKEN_INDEX = -200 -DEFAULT_IMAGE_TOKEN = "" -DEFAULT_IMAGE_PATCH_TOKEN = "" -DEFAULT_IM_START_TOKEN = "" -DEFAULT_IM_END_TOKEN = "" -IMAGE_PLACEHOLDER = "" - - -conv_mode = "llama_3" - -warnings.simplefilter("ignore", category=DeprecationWarning) -warnings.filterwarnings("ignore") - -from loguru import logger as eval_logger -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel - -def process(image, question, tokenizer, image_processor, model_config): - qs = question - - if model_config.mm_use_im_start_end: - qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs - else: - qs = DEFAULT_IMAGE_TOKEN + '\n' + str(qs) - - conv = conv_templates[conv_mode].copy() - conv.append_message(conv.roles[0], qs) - conv.append_message(conv.roles[1], None) - prompt = conv.get_prompt() - - image_size = [image.size] - image_tensor = process_images([image], image_processor, model_config) - - input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() - - return input_ids, image_tensor, image_size, prompt - -def make_context( - tokenizer: PreTrainedTokenizer, - query: str, - history: List[Tuple[str, str]] = None, - system: str = "", - max_window_size: int = 6144, - chat_format: str = "chatml", -): - if history is None: - history = [] - - if chat_format == "chatml": - im_start, im_end = "<|im_start|>", "<|im_end|>" - im_start_tokens = [tokenizer.im_start_id] - im_end_tokens = [tokenizer.im_end_id] - nl_tokens = tokenizer.encode("\n") - - def _tokenize_str(role, content): - return f"{role}\n{content}", tokenizer.encode(role, allowed_special=set(tokenizer.IMAGE_ST)) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST)) - - system_text, system_tokens_part = _tokenize_str("system", system) - system_tokens = im_start_tokens + system_tokens_part + im_end_tokens - - raw_text = "" - context_tokens = [] - - for turn_query, turn_response in reversed(history): - query_text, query_tokens_part = _tokenize_str("user", turn_query) - query_tokens = im_start_tokens + query_tokens_part + im_end_tokens - if turn_response is not None: - response_text, response_tokens_part = _tokenize_str("assistant", turn_response) - response_tokens = im_start_tokens + response_tokens_part + im_end_tokens - - next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens - prev_chat = f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" - else: - next_context_tokens = nl_tokens + query_tokens + nl_tokens - prev_chat = f"\n{im_start}{query_text}{im_end}\n" - - current_context_size = len(system_tokens) + len(next_context_tokens) + len(context_tokens) - if current_context_size < max_window_size: - context_tokens = next_context_tokens + context_tokens - raw_text = prev_chat + raw_text - else: - break - - context_tokens = system_tokens + context_tokens - raw_text = f"{im_start}{system_text}{im_end}" + raw_text - context_tokens += nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] + im_end_tokens + nl_tokens + im_start_tokens + tokenizer.encode("assistant") + nl_tokens - raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" - - elif chat_format == "raw": - raw_text = query - context_tokens = tokenizer.encode(raw_text) - else: - raise NotImplementedError(f"Unknown chat format {chat_format!r}") - - return raw_text, context_tokens - - - -@register_model("cambrian_8b") -class Cambrian_8b(lmms): - """ - cambrian_8b model - 
https://huggingface.co/nyu-visionx/cambrian-8b - """ - - def __init__( - self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/cambrian_8b", - # pretrained: str = "nyu-visionx/cambrian-8b", - device: Optional[str] = "cuda", - device_map="auto", - batch_size: Optional[Union[int, str]] = 1, - trust_remote_code: Optional[bool] = True, - use_cache=True, - **kwargs, - ) -> None: - super().__init__() - assert kwargs == {}, f"Unexpected kwargs: {kwargs}" - - - accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) - accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) - if accelerator.num_processes > 1 and device_map == "": - self._device = torch.device(f"cuda:{accelerator.local_process_index}") - self.device_map = f"cuda:{accelerator.local_process_index}" - else: - self._device = device - self.device_map = device_map - - model_name = pretrained # Assuming `pretrained` is a path or model id - # self._model = AutoModelForCausalLM.from_pretrained( - # pretrained, - # torch_dtype="auto", - # device_map=self.device_map - # ) - # self._model = AutoModel.from_pretrained(self.pretrained, device_map=self.device_map, trust_remote_code=True) - # self._model = AutoModelForCausalLM.from_pretrained(pretrained, device_map=self._device, trust_remote_code=trust_remote_code).eval() - # model_path = os.path.expanduser("nyu-visionx/cambrian-8b") - model_name = get_model_name_from_path(pretrained) - tokenizer, model, self.image_processor, context_len = load_pretrained_model(pretrained, None, model_name,device_map=self.device_map) - self._model = model - self._tokenizer = tokenizer - self.tokenizer.pad_token_id=self.tokenizer.eos_token_id - # device_map=self.device_map - # self._tokenizer = AutoTokenizer.from_pretrained(pretrained) - self._model.eval() - # self._config = self._model.config - # self.model.tie_weights() - self.batch_size_per_gpu = int(batch_size) - self.use_cache = use_cache - if accelerator.num_processes > 1: - assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." - # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model - # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works - # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. - if accelerator.distributed_type == DistributedType.DEEPSPEED: - kwargs = { - "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, - "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, - } - AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) - eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. 
Make sure you run `accelerate config` and set zero stage to 0") - if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: - self._model = accelerator.prepare(self.model) - else: - self._model = accelerator.prepare_model(self.model, evaluation_mode=True) - self.accelerator = accelerator - if self.accelerator.is_local_main_process: - eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") - self._rank = self.accelerator.local_process_index - self._world_size = self.accelerator.num_processes - else: - self.model.to(self._device) - self._rank = 0 - self._word_size = 1 - # if accelerator.num_processes > 1: - # self._model = accelerator.prepare(self._model) - # else: - # self.model = torch.nn.DataParallel(self.model) - # self.model.to(self._device) - - self.accelerator = accelerator - - @property - def config(self): - # return the associated transformers.AutoConfig for the given pretrained model. - return self._config - - @property - def tokenizer(self): - return self._tokenizer - - @property - def model(self): - # returns the model, unwrapping it if using Accelerate - if hasattr(self, "accelerator"): - return self.accelerator.unwrap_model(self._model) - else: - return self._model - - @property - def eot_token_id(self): - # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* - return self.tokenizer.eos_token_id - - @property - def max_length(self): - return self._max_length - - # should be deleted since max_new_tokens is decided by gen_kwargs not a model property - # @property - # def max_new_tokens(self) -> int: - # return 256 - - @property - def batch_size(self): - return self.batch_size_per_gpu - - @property - def device(self): - return self._device - - @property - def rank(self): - return self._rank - - @property - def world_size(self): - return self._world_size - - def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: - res = [] - pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") - - for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: - # encode, pad, and truncate contexts for this batch - if type(doc_to_target) == str: - continuation = doc_to_target - else: - continuation = doc_to_target(self.task_dict[task][split][doc_id]) - visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] - visuals = self.flatten(visuals) - query = [] - visual_paths = [] - for visual in visuals: - name = uuid.uuid4().hex.upper()[0:6] - visual.save(f"/tmp/{name}.png") - visual_paths.append(f"/tmp/{name}.png") - query.append({"image": f"/tmp/{name}.png"}) - - # Make a copy for query to save context (text that needs to be masked) - context_query = [_ for _ in query] - context_query.append({"text": contexts}) - query.append({"text": contexts + continuation}) - - context_query = self.tokenizer.from_list_format(context_query) - query = self.tokenizer.from_list_format(query) - - raw_contxt_text, context_tokens = make_context( - self.tokenizer, context_query, history=None, system="You are a helpful assistant", max_window_size=self.model.generation_config.max_window_size, chat_format=self.model.generation_config.chat_format - ) - context_tokens = torch.tensor([context_tokens]) - - raw_continuation_text, continuation_tokens = make_context( - self.tokenizer, query, history=None, system="You are a helpful assistant", max_window_size=self.model.generation_config.max_window_size, 
chat_format=self.model.generation_config.chat_format - ) - continuation_tokens = torch.tensor([continuation_tokens]).to(self.model.device) - attn_mask = torch.ones_like(continuation_tokens).to(self.model.device) - labels = continuation_tokens.clone().to(self.model.device) - labels[:, : context_tokens.shape[1]] = -100 - with torch.inference_mode(): - outputs = self.model(input_ids=continuation_tokens, labels=labels, attention_mask=attn_mask) - loss = outputs.loss - logits = outputs["logits"] - greedy_tokens = logits.argmax(dim=-1) - cont_toks = continuation_tokens[:, context_tokens.shape[1] :] - greedy_tokens = greedy_tokens[:, context_tokens.shape[1] : continuation_tokens.shape[1]] # [1, seq] - max_equal = (greedy_tokens == cont_toks).all() - res.append((float(loss.item()), bool(max_equal))) - pbar.update(1) - - pbar.close() - return res - - def flatten(self, input): - new_list = [] - for i in input: - for j in i: - new_list.append(j) - return new_list - - def generate_until(self, requests: List[Instance]) -> List[str]: - res = [] - - def _collate(x): - # the negative sign on len(toks) sorts descending - this has a few advantages: - # - time estimates will always be over not underestimates, which is more useful for planning - # - to know the size of a batch when going through the list, you know the first one is always the batch - # padded context length. this is useful to simplify the batching logic and more importantly to make - # automatic adaptive batches much much easier to implement - # - any OOMs will happen right away rather than near the end - toks = self.tokenizer.encode(x[0]) - return -len(toks), x[0] - - pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") - # we group requests by their generation_kwargs, - # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling - # in the same batch. - re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) - chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) - for chunk in chunks: - contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) - task = task[0] - split = split[0] - visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] - visuals = self.flatten(visuals) - visual_paths = [] - # save images to /tmp, name generated by hash function - # qwen accept image path. Have to do it here.... - for visual in visuals: - name = uuid.uuid4().hex.upper()[0:6] - visual.save(f"/ML-A100/team/mm/zk/lmms-eval/lmms_eval/tmp/{name}.png") - visual_paths.append(f"/ML-A100/team/mm/zk/lmms-eval/lmms_eval/tmp/{name}.png") - - # we assume all gen kwargs in the batch are the same - # this is safe to assume because the `grouper` object ensures it. 
- gen_kwargs = all_gen_kwargs[0] - - # Set default values for until and max_new_tokens - # until = [self.tokenizer.decode(self.eot_token_id)] - - # # Update values from gen_kwargs if present - # if "until" in gen_kwargs: - # until = gen_kwargs.pop("until") - # if isinstance(until, str): - # until = [until] - # elif not isinstance(until, list): - # raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") - # preconfigure gen_kwargs with defaults - if "image_sizes" not in gen_kwargs: - try: - gen_kwargs["image_sizes"] = [visuals[0].size] - except: - gen_kwargs["image_sizes"] = None - if "max_new_tokens" not in gen_kwargs: - gen_kwargs["max_new_tokens"] = 1024 - if "temperature" not in gen_kwargs: - gen_kwargs["temperature"] = 0 - if "top_p" not in gen_kwargs: - gen_kwargs["top_p"] = None - if "num_beams" not in gen_kwargs: - gen_kwargs["num_beams"] = 1 - - # self.tokenizer.pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eod_id - # image_path = input("image path: ") - image = Image.open(visual_paths[0]).convert('RGB') - # question = input("question: ") - question=contexts[0] - # print(question) - input_ids, image_tensor, image_sizes, prompt = process(image, question, self.tokenizer, self.image_processor, self.model.config) - input_ids = input_ids.to(device='cuda', non_blocking=True) - with torch.inference_mode(): - output_ids = self.model.generate( - input_ids, - images=image_tensor, - image_sizes=image_sizes, - do_sample=True if temperature > 0 else False, - temperature=gen_kwargs["temperature"], - num_beams=gen_kwargs["num_beams"], - max_new_tokens=gen_kwargs["max_new_tokens"], - use_cache=True) - text_outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() - # cont_toks_list = cont.tolist() - # for cont_toks, context in zip(cont_toks_list, contexts): - # # discard context + left-padding toks if using causal decoder-only LMM - # cont_toks = cont_toks[input_ids.input_ids.shape[1] :] - # text_outputs = self.tokenizer.decode(cont_toks, skip_special_tokens=True).strip() - # for term in until: - # if len(term) > 0: - # # ignore '' separator, - # # for seq2seq case where self.tok_decode(self.eot_token_id) = '' - # text_outputs = text_outputs.split(term)[0] - - # res.append(text_outputs) - - # self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs) - # remove visuals from tmp - # print(text_outputs) - for visual_path in visual_paths: - try: - os.remove(visual_path) - except: - pass - output_text=[text_outputs] - res.extend(output_text) - pbar.update(1) - # reorder this group of results back to original unsorted form - # res = re_ords.get_original(res) - - - pbar.close() - return res diff --git a/lmms_eval/models/cambrian.py b/lmms_eval/models/cambrian_model.py similarity index 93% rename from lmms_eval/models/cambrian.py rename to lmms_eval/models/cambrian_model.py index 0085a27e6..4da0c5e1a 100644 --- a/lmms_eval/models/cambrian.py +++ b/lmms_eval/models/cambrian_model.py @@ -8,16 +8,6 @@ from PIL import Image from datetime import timedelta from lmms_eval.api.registry import register_model -# from lmms_eval.models.model_utils.qwen.qwen_generate_utils import make_context -# 获取当前文件的目录 -# current_dir = os.path.dirname(os.path.abspath(__file__)) -# # 获取同级目录的路径 -# parent_dir = os.path.dirname(current_dir) - -# # 将同级目录添加到模块搜索路径 -# sys.path.append(parent_dir) -# sys.path.append(current_dir) -# os.chdir('./lmms_eval/models/cambrian') from 
cambrian.model.builder import load_pretrained_model from cambrian.conversation import conv_templates, SeparatorStyle from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path @@ -131,7 +121,7 @@ def _tokenize_str(role, content): -@register_model("cambrian") +@register_model("cambrian_model") class Cambrian(lmms): """ cambrian_8b model @@ -140,8 +130,7 @@ class Cambrian(lmms): def __init__( self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/cambrian_8b", - # pretrained: str = "nyu-visionx/cambrian-8b", + pretrained: str = "nyu-visionx/cambrian-8b", device: Optional[str] = "cuda", device_map="auto", batch_size: Optional[Union[int, str]] = 1, @@ -353,7 +342,7 @@ def _collate(x): for visual in visuals: name = uuid.uuid4().hex.upper()[0:6] visual.save(f"./lmms_eval/tmp/{name}.png") - visual_paths.append(f"./lmms_eval/tmp/{name}.png") + visual_paths.append(f"./lmms-eval/lmms_eval/tmp/{name}.png") # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. @@ -370,15 +359,15 @@ def _collate(x): elif not isinstance(until, list): raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") # preconfigure gen_kwargs with defaults - if "image_sizes" not in gen_kwargs: - try: - gen_kwargs["image_sizes"] = [visuals[0].size] - except: - gen_kwargs["image_sizes"] = None + # if "image_sizes" not in gen_kwargs: + # try: + # gen_kwargs["image_sizes"] = [visuals[0].size] + # except: + # gen_kwargs["image_sizes"] = None if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: - gen_kwargs["temperature"] = 0 + gen_kwargs["temperature"]= 0 if "top_p" not in gen_kwargs: gen_kwargs["top_p"] = None if "num_beams" not in gen_kwargs: @@ -400,12 +389,13 @@ def _collate(x): input_ids, images=image_tensor, image_sizes=image_sizes, - do_sample=True if temperature > 0 else False, - temperature=gen_kwargs["temperature"], - num_beams=gen_kwargs["num_beams"], - max_new_tokens=gen_kwargs["max_new_tokens"], - # eos_token_id=eot_token_id, - use_cache=True) + **gen_kwargs) + # do_sample=True if temperature > 0 else False, + # temperature=gen_kwargs["temperature"], + # num_beams=gen_kwargs["num_beams"], + # max_new_tokens=gen_kwargs["max_new_tokens"], + # # eos_token_id=eot_token_id, + # use_cache=True) text_outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() # cont_toks_list = cont.tolist() diff --git a/lmms_eval/models/emu2.py b/lmms_eval/models/emu2.py index 4684599f4..fb77af853 100755 --- a/lmms_eval/models/emu2.py +++ b/lmms_eval/models/emu2.py @@ -15,10 +15,7 @@ from tqdm import tqdm from accelerate import Accelerator, DistributedType from accelerate.state import AcceleratorState -<<<<<<< HEAD from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e from loguru import logger as eval_logger @@ -33,10 +30,7 @@ def __init__( self, pretrained: str = "BAAI/Emu2", device: Optional[str] = "cuda", -<<<<<<< HEAD device_map='auto', -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e max_new_tokens: int = 256, batch_size: Optional[Union[int, str]] = 1, **kwargs, @@ -51,7 +45,6 @@ def __init__( else: self._device = device -<<<<<<< HEAD self._model = AutoModelForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self._device, trust_remote_code=True) # with init_empty_weights(): 
# model = AutoModelForCausalLM.from_pretrained( @@ -68,9 +61,6 @@ def __init__( # model, # 'local/path/to/hf/version/Emu2-Chat/model', # device_map=device_map).eval() -======= - self._model = AutoModelForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device, trust_remote_code=True) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e # self._model = None self.model.eval() self.model.tie_weights() @@ -204,15 +194,12 @@ def _collate(x): gen_kwargs = all_gen_kwargs[0] if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 -<<<<<<< HEAD if "until" in gen_kwargs: until = gen_kwargs.pop("until") if isinstance(until, str): until = [until] elif not isinstance(until, list): raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk) visuals = [doc_to_visual(self.task_dict[task][split][ids]) for ids, task, split, doc_to_visual in zip(doc_id, tasks, splits, doc_to_visuals)] # print(visuals[0]) @@ -221,11 +208,6 @@ def _collate(x): # visuals = [visuals[idx][0] for idx in range(len(visuals))] # get the first image in multi-image scenarios. # assert len(contexts) == self.batch_size_per_gpu, f"Expected contexts batch size {self.batch_size_per_gpu}, got {len(contexts)}" # assert len(visuals) == self.batch_size_per_gpu, f"Expected visuals batch size {self.batch_size_per_gpu}, got {len(visuals)}" -<<<<<<< HEAD -======= - print('') - print(contexts) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e formatted_contexts = [f"[]{context}" for context in contexts] formatted_contexts[0] = formatted_contexts[0].replace('.', ':') # print(formatted_contexts) @@ -233,12 +215,8 @@ def _collate(x): inputs = self.model.build_input_ids( text=formatted_contexts, tokenizer=self.tokenizer, -<<<<<<< HEAD image=visuals, # device=self.model.device, -======= - image=visuals ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e ) # inputs = self.model.build_input_ids( # text=formatted_contexts, @@ -249,21 +227,12 @@ def _collate(x): # ) outputs = self.model.generate( -<<<<<<< HEAD input_ids=inputs["input_ids"].to(self.device), attention_mask=inputs["attention_mask"].to(self.device), image=inputs["image"].to(self.device,torch.bfloat16), # max_new_tokens=gen_kwargs["max_new_tokens"], # length_penalty=-1 **gen_kwargs -======= - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image=inputs["image"].to(torch.bfloat16), - max_new_tokens=gen_kwargs["max_new_tokens"], - length_penalty=-1 - # **gen_kwargs ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e ) output_text = self._tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/lmms_eval/models/gemma.py b/lmms_eval/models/gemma.py index c45cfd5af..93f41a563 100644 --- a/lmms_eval/models/gemma.py +++ b/lmms_eval/models/gemma.py @@ -30,7 +30,7 @@ class Gemma(lmms): """ def __init__( self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/gemma-1.1-7b-it", + pretrained: str = "None", device: Optional[str] = "cuda", batch_size: Optional[Union[int, str]] = 1, device_map="auto", diff --git a/lmms_eval/models/gpt4v_01.py b/lmms_eval/models/gpt4v_01.py new file mode 100755 index 000000000..e65337679 --- /dev/null +++ b/lmms_eval/models/gpt4v_01.py @@ -0,0 +1,202 @@ +from io import BytesIO +from copy import deepcopy +import numpy as np +import os +import base64 +from typing import List, Tuple +from tqdm import tqdm +import 
requests as url_requests +import time + + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval import utils + +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState + +try: + from decord import VideoReader, cpu +except ImportError: + pass + +from PIL import Image + +API_TYPE = os.getenv("API_TYPE", "openai") +NUM_SECONDS_TO_SLEEP = 30 +from loguru import logger as eval_logger +from openai import OpenAI +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +@register_model("gpt4v_01") +class GPT4V_01(lmms): + def __init__( + self, + model_version: str = "gpt-4-vision-preview", + modality: str = "image", + max_frames_for_video: int = 10, + timeout: int = 120, + **kwargs, + ) -> None: + super().__init__() + # Manually set a image token for GPT4V so that we can search for it + # and split the text and image + # Here we just use the same token as llava for convenient + self.model_version = model_version + self.modality = modality + self.max_frames_for_video = max_frames_for_video + self.image_token = "" + self.timeout = timeout + + + accelerator = Accelerator() + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." 
+ self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + # Function to encode the image + def encode_image(self, image: Image): + output_buffer = BytesIO() + image.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + # Function to encode the video + def encode_video(self, video_path, for_get_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + img = Image.fromarray(frame) + output_buffer = BytesIO() + img.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + base64_frames.append(base64_str) + + return base64_frames + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + # visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = [doc_to_visual(self.task_dict[task][split][0])] + visuals = self.flatten(visuals) + imgs = [] # multiple images or frames for video + for visual in visuals: + if self.modality == "image": + img = self.encode_image(visual) + imgs.append(img) + elif self.modality == "video": + frames = self.encode_video(visual, self.max_frames_for_video) + imgs.extend(frames) + + payload = {"model": self.model_version, "messages": []} + response_json = {"role": "user", "content": []} + # When there is no image token in the context, append the image to the text + if self.image_token not in contexts: + payload["messages"].append(deepcopy(response_json)) + payload["messages"][0]["content"].append({"type": "text", "text": contexts}) + for img in imgs: + payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + else: + contexts = contexts.split(self.image_token) + for idx, img in enumerate(imgs): + payload["messages"].append(deepcopy(response_json)) + payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]}) + payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + + # If n image tokens are in the contexts + # contexts will be splitted into n+1 chunks + # Manually add it into the payload + payload["messages"].append(deepcopy(response_json)) + payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = 
None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + payload["max_tokens"] = gen_kwargs["max_new_tokens"] + payload["temperature"] = gen_kwargs["temperature"] + + for attempt in range(5): + try: + client = OpenAI( + base_url=API_URL, + api_key=API_KEY, + ) + # print(payload["messages"]) + response = client.chat.completions.create( + model=self.model_version, + messages=payload["messages"], + max_tokens=payload["max_tokens"], + # timeout=timeout, + temperature=payload["temperature"], + ) + + content = response.choices[0].message.content.strip() + break # If successful, break out of the loop + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}") + if attempt < 5 - 1: # If we have retries left, sleep and then continue to next attempt + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}") + eval_logger.error(f"Response: {response}") + content = "" + res.append(content) + pbar.update(1) + pbar.close() + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "GPT4V not support" diff --git a/lmms_eval/models/gpt4v_01_batch.py b/lmms_eval/models/gpt4v_01_batch.py new file mode 100755 index 000000000..9246119fb --- /dev/null +++ b/lmms_eval/models/gpt4v_01_batch.py @@ -0,0 +1,224 @@ +from io import BytesIO +from copy import deepcopy +from concurrent.futures import ThreadPoolExecutor, as_completed +import numpy as np +import os +import base64 +from typing import List, Tuple +from tqdm import tqdm +import requests as url_requests +import time + + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval import utils + +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState + +try: + from decord import VideoReader, cpu +except ImportError: + pass + +from PIL import Image + +API_TYPE = os.getenv("API_TYPE", "openai") +NUM_SECONDS_TO_SLEEP = 30 +from loguru import logger as eval_logger +from openai import OpenAI +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +# def is_openai_v1() -> bool: +# from importlib.metadata import version +# from packaging.version import Version, parse +# _version = parse(version("openai")) +# return _version >= Version("1.0.0") + + +# if is_openai_v1(): +# API_URL = os.path.join(API_URL, "openai") + + + +@register_model("gpt4v_01_batch") +class GPT4V_01_batch(lmms): + def __init__( + self, + model_version: str = "gpt-4-vision-preview", + modality: str = "image", + max_frames_for_video: int = 10, + timeout: int = 120, + **kwargs, + ) -> None: + super().__init__() + # Manually set a image token for GPT4V so that we can search for it + # and split the text and image + # Here we just use the same token as llava for convenient + self.model_version = model_version + self.modality = modality + 
self.max_frames_for_video = max_frames_for_video + self.image_token = "" + self.timeout = timeout + + + accelerator = Accelerator() + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + # Function to encode the image + def encode_image(self, image: Image): + output_buffer = BytesIO() + image.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + # Function to encode the video + def encode_video(self, video_path, for_get_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + img = Image.fromarray(frame) + output_buffer = BytesIO() + img.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + base64_frames.append(base64_str) + + return base64_frames + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def process_request(self, request, idx): + """Process a single request.""" + contexts, gen_kwargs, doc_to_visual, doc_id, task, split = request.args + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + imgs = [] # multiple images or frames for video + + for visual in visuals: + if self.modality == "image": + img = self.encode_image(visual) + imgs.append(img) + elif self.modality == "video": + frames = self.encode_video(visual, self.max_frames_for_video) + imgs.extend(frames) + + payload = {"model": self.model_version, "messages": []} + response_json = {"role": "user", "content": []} + + if self.image_token not in contexts: + payload["messages"].append(deepcopy(response_json)) + payload["messages"][0]["content"].append({"type": "text", "text": contexts}) + for img in imgs: + payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + else: + contexts = contexts.split(self.image_token) + for idx, img in enumerate(imgs): + payload["messages"].append(deepcopy(response_json)) + payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]}) + payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + + payload["messages"].append(deepcopy(response_json)) + payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if
"temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + payload["max_tokens"] = gen_kwargs["max_new_tokens"] + payload["temperature"] = gen_kwargs["temperature"] + for attempt in range(5): + try: + client = OpenAI( + base_url=API_URL, + api_key=API_KEY, + ) + + response = client.chat.completions.create( + model=self.model_version, + messages=payload["messages"], + max_tokens=payload["max_tokens"], + temperature=payload["temperature"], + ) + + content = response.choices[0].message.content.strip() + return content # return the content on success + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}") + if attempt < 5 - 1: # if there are retries left + time.sleep(NUM_SECONDS_TO_SLEEP) + else: + eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}") + return "" # return an empty string as the failure result + + def generate_until(self, requests) -> List[str]: + res = [None] * len(requests) # initialize the result array with None, same length as the request list + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + NUM_WORKERS = 16 + with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: + future_to_request = {executor.submit(lambda reg=reg, idx=idx: self.process_request(reg, idx), reg): idx for idx, reg in enumerate(requests)} + + for future in as_completed(future_to_request): + idx = future_to_request[future] # get the index of the corresponding request + try: + content = future.result() + except Exception as e: + eval_logger.error(f"Error processing request: {str(e)}") + content = "" + + res[idx] = content # put the result in its corresponding position + pbar.update(1) + + pbar.close() + return res # return the result array + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "GPT4V not support" diff --git a/lmms_eval/models/idefics2.py b/lmms_eval/models/idefics2.py index 0d73d2c95..1cb82f076 100644 --- a/lmms_eval/models/idefics2.py +++ b/lmms_eval/models/idefics2.py @@ -215,19 +215,11 @@ def _collate(x): prompt = self._processor.apply_chat_template(message, add_generation_prompt=True) prompts.append(prompt) -<<<<<<< HEAD # print(contexts) # print(prompts) # print(visuals) # input() # -======= - print(contexts) - print(prompts) - print(visuals) - input() - ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e inputs = self._processor(text=prompts, images=visuals, padding=True, return_tensors="pt") if 'max_new_tokens' not in gen_kwargs: gen_kwargs['max_new_tokens']=1024 diff --git a/lmms_eval/models/internvl2.py b/lmms_eval/models/internvl2.py index 5115c6974..94fa3de03 100644 --- a/lmms_eval/models/internvl2.py +++ b/lmms_eval/models/internvl2.py @@ -12,11 +12,7 @@ from lmms_eval.api.model import lmms from tqdm import tqdm import logging -<<<<<<< HEAD import math -======= - ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e eval_logger = logging.getLogger("eval_logger") IMAGENET_MEAN = (0.485, 0.456, 0.406) @@ -125,7 +121,6 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3 from accelerate.state import AcceleratorState from accelerate.utils import InitProcessGroupKwargs -<<<<<<< HEAD # def split_model(model_name): # device_map = {} # world_size = 8 @@ -150,8 +145,6 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3 # device_map['language_model.lm_head'] = 0 # device_map[f'language_model.model.layers.{num_layers - 1}'] = 0 # return device_map -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e @register_model("internvl2")
class InternVL2(lmms): @@ -173,11 +166,7 @@ def __init__( batch_size = int(batch_size) assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}." self.batch_size_per_gpu = batch_size -<<<<<<< HEAD model_key=self.path.split('/')[-1] -======= - ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) self.accelerator = accelerator diff --git a/lmms_eval/models/minicpm_v.py b/lmms_eval/models/minicpm_v.py index e117e7fad..26c48e800 100644 --- a/lmms_eval/models/minicpm_v.py +++ b/lmms_eval/models/minicpm_v.py @@ -180,13 +180,9 @@ def _collate(x): elif not isinstance(until, list): raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") assert self.batch_size_per_gpu == 1, "Do not support batch_size_per_gpu > 1 for now" -<<<<<<< HEAD # assert len(visuals) == 1, "MiniCPM_V interface does not support bn_image > 1 for now" if len(visuals)>1: visuals=visuals[:1]#debug use first image -======= - assert len(visuals) == 1, "MiniCPM_V interface does not support bn_image > 1 for now" ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e context = contexts[0] if "" in context: # minicpm does not expect the tag diff --git a/lmms_eval/models/paligemma.py b/lmms_eval/models/paligemma.py index 97e252085..1931264cc 100755 --- a/lmms_eval/models/paligemma.py +++ b/lmms_eval/models/paligemma.py @@ -4,11 +4,7 @@ warnings.filterwarnings("ignore") from accelerate import Accelerator, DistributedType -<<<<<<< HEAD from transformers import AutoProcessor, PaliGemmaForConditionalGeneration,AutoTokenizer -======= -from transformers import AutoProcessor, PaliGemmaForConditionalGeneration ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model import torch @@ -47,7 +43,6 @@ def __init__( else: self._device = device -<<<<<<< HEAD # self._model = FuyuForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device) self._model = PaliGemmaForConditionalGeneration.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device) self.model.eval() @@ -57,15 +52,6 @@ def __init__( # self.image_processor = FuyuImageProcessor() self.processor = AutoProcessor.from_pretrained(pretrained) -======= - self._model = PaliGemmaForConditionalGeneration.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device, revision="bfloat16", trust_remote_code=True).eval() - # self._model = None - self.model.eval() - self.model.tie_weights() - self._tokenizer = AutoProcessor.from_pretrained(pretrained) - self._config = self.model.config - ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e self.max_new_tokens = max_new_tokens self.batch_size_per_gpu = int(batch_size) accelerator = Accelerator() @@ -166,15 +152,12 @@ def generate_until(self, requests: List[Instance]) -> List[str]: res = [] def _collate(x): -<<<<<<< HEAD # the negative sign on len(toks) sorts descending - this has a few advantages: # - time estimates will always be over not underestimates, which is more useful for planning # - to know the size of a batch when going through the list, you know the first one is always the batch # padded context length. 
this is useful to simplify the batching logic and more importantly to make # automatic adaptive batches much much easier to implement # - any OOMs will happen right away rather than near the end -======= ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e toks = self.tok_encode(x[0]) return -len(toks), x[0] @@ -184,7 +167,6 @@ def _collate(x): pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") for chunk in chunks: -<<<<<<< HEAD contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) task = task[0] split = split[0] @@ -245,50 +227,6 @@ def _collate(x): # app = [gen_text.split("\x04")[1].strip(" ").strip("\n") for gen_text in generation_texts] # print("输入:",formatted_contexts) # res.extend(response) -======= - # contexts, all_gen_kwargs, doc_to_visual, doc_id, tasks, split = zip(*chunk) - contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk) - gen_kwargs = all_gen_kwargs[0] - if "max_new_tokens" not in gen_kwargs: - gen_kwargs["max_new_tokens"] = 1024 - contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk) - visuals = [doc_to_visual(self.task_dict[task][split][ids]) for ids, task, split, doc_to_visual in zip(doc_id, tasks, splits, doc_to_visuals)] - # print(visuals[0]) - visuals = [v[0] for v in visuals] - - formatted_contexts = [context for context in contexts] - formatted_contexts[0] = formatted_contexts[0].replace('.', ':') - - # inputs = self.model.build_input_ids( - # text=formatted_contexts, - # tokenizer=self.tokenizer, - # image=visuals - # ) - - model_inputs = self.tokenizer(text=formatted_contexts, images=visuals, return_tensors="pt").to("cuda") - input_len = model_inputs["input_ids"].shape[-1] - # print('question is: ') - print(formatted_contexts) - generation = self.model.generate(**model_inputs, max_new_tokens = gen_kwargs['max_new_tokens'], do_sample=False) - generation = generation[0][input_len:] - decoded = self.tokenizer.decode(generation, skip_special_tokens=True) - # print(decoded) - - # outputs = self.tokenizer.generate( - # input_ids=inputs["input_ids"], - # attention_mask=inputs["attention_mask"], - # image=inputs["image"].to(torch.bfloat16), - # max_new_tokens=gen_kwargs["max_new_tokens"], - # length_penalty=-1 - # # **gen_kwargs - # ) - - - # output_text = self._tokenizer.batch_decode(outputs, skip_special_tokens=True) - # output_text = [t.strip(" ").strip("\n") for t in output_text] - # print(decoded) - res.extend([decoded]) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e pbar.update(1) pbar.close() @@ -337,11 +275,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: """ """ add_special_tokens = False if add_special_tokens is None else add_special_tokens -<<<<<<< HEAD encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) -======= - encoding = self.tokenizer.tokenizer.encode(string, add_special_tokens=add_special_tokens) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e # left-truncate the encoded context to be at most `left_truncate_len` tokens long if left_truncate_len: encoding = encoding[-left_truncate_len:] diff --git a/lmms_eval/models/qwen.py b/lmms_eval/models/qwen.py index 9e37c69b8..88f59562a 100644 --- a/lmms_eval/models/qwen.py +++ b/lmms_eval/models/qwen.py @@ -31,7 +31,7 @@ class Qwen(lmms): def __init__( self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/Qwen2-72B-Instruct", + pretrained: str = None, 
device: Optional[str] = "cuda", batch_size: Optional[Union[int, str]] = 1, device_map="auto", diff --git a/lmms_eval/models/test_swwu.py b/lmms_eval/models/test_swwu.py deleted file mode 100644 index 1392c7894..000000000 --- a/lmms_eval/models/test_swwu.py +++ /dev/null @@ -1,78 +0,0 @@ -<<<<<<< HEAD -from PIL import Image -import requests -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - - -tokenizer = AutoTokenizer.from_pretrained("/ML-A100/team/mm/wangzekun/kangz/ViLLaMA/models/Emu2-Chat") - -model = AutoModelForCausalLM.from_pretrained( - "/ML-A100/team/mm/wangzekun/kangz/ViLLaMA/models/Emu2-Chat", - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - trust_remote_code=True).to('cuda').eval() - - -# `[]` is the image placeholder which will be replaced by image embeddings. -# the number of `[]` should be equal to the number of input images - -query = 'Describe the image in details:' -image = Image.open('./image2.jpg').convert('RGB') - -print([query]) -print([image]) -inputs = model.build_input_ids( - text=[query], - tokenizer=tokenizer, - image=[image] -) - -with torch.no_grad(): - outputs = model.generate( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image=inputs["image"].to(torch.bfloat16), - max_new_tokens=64, - length_penalty=-1) - -output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - -print(output_text) -======= -from transformers import AutoProcessor, PaliGemmaForConditionalGeneration -from PIL import Image -import requests -import torch - -model_id = "/ML-A100/team/mm/zhangge/models/paligemma-3b-pt-224" -device = "cuda:0" -dtype = torch.bfloat16 - - -model = PaliGemmaForConditionalGeneration.from_pretrained( - model_id, - torch_dtype=dtype, - device_map=device, - revision="bfloat16", -).eval() -processor = AutoProcessor.from_pretrained(model_id) - - -prompt = 'What is the difference of those two images:' -image = Image.open('./image2.jpg').convert('RGB') - -model_inputs = processor(text=[prompt], images=[image, image], return_tensors="pt").to(model.device) - -print(model_inputs) - -input() -input_len = model_inputs["input_ids"].shape[-1] - -with torch.inference_mode(): - generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False) - generation = generation[0][input_len:] - decoded = processor.decode(generation, skip_special_tokens=True) - print(decoded) ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e diff --git a/lmms_eval/models/yi.py b/lmms_eval/models/yi.py index b8fd2e0f2..ba08fd253 100644 --- a/lmms_eval/models/yi.py +++ b/lmms_eval/models/yi.py @@ -31,7 +31,7 @@ class Yi(lmms): def __init__( self, - pretrained: str = "/ML-A100/team/mm/zhangge/models/Yi-1.5-34B-Chat", + pretrained: str = "None", device: Optional[str] = "cuda", batch_size: Optional[Union[int, str]] = 1, device_map="auto", diff --git a/lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml b/lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml index dd2973d4f..c2e9d79b3 100755 --- a/lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml +++ b/lmms_eval/tasks/ai2d_all_VD/ai2d_all.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-VD/ai2d -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-VD/ai2d ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e task: "ai2d_vd" dataset_kwargs: token: True diff --git a/lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml b/lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml index 07eee1bbc..f9069efca 100755 --- a/lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml +++ 
b/lmms_eval/tasks/ai2d_suit/ai2d_suit.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-jsonl/ai2d -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-jsonl/ai2d ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e task: "ai2d_suit" dataset_kwargs: token: True diff --git a/lmms_eval/tasks/chartqa_all_vd/chartqa.yaml b/lmms_eval/tasks/chartqa_all_vd/chartqa.yaml index f8d5300b0..5bb135f84 100755 --- a/lmms_eval/tasks/chartqa_all_vd/chartqa.yaml +++ b/lmms_eval/tasks/chartqa_all_vd/chartqa.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-VD/ChartQA/ -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-VD/ChartQA/ ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "chartqa_vd" diff --git a/lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml b/lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml index 8bee1b819..0471b0c2b 100755 --- a/lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml +++ b/lmms_eval/tasks/chartqa_suit_vd/chartqa.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-selected-VD/ChartQA/ -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-selected-VD/ChartQA/ ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "chartqa_suit_vd" diff --git a/lmms_eval/tasks/mmmu/mmmu_aug.yaml b/lmms_eval/tasks/mmmu/mmmu_aug.yaml index 492c8f108..d335d0896 100755 --- a/lmms_eval/tasks/mmmu/mmmu_aug.yaml +++ b/lmms_eval/tasks/mmmu/mmmu_aug.yaml @@ -1,4 +1,4 @@ -dataset_path: /ML-A100/team/mm/zhangge/MMMU/right_and_error/reorder/MMMU_Dataset/mmmu_aug_dataset +dataset_path: ./MMMU/right_and_error/reorder/MMMU_Dataset/mmmu_aug_dataset task: "mmmu_aug" test_split: train output_type: generate_until diff --git a/lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml b/lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml index 7902ea848..cc754d478 100644 --- a/lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml +++ b/lmms_eval/tasks/mmmu/mmmu_group_img_aug.yaml @@ -1,4 +1,4 @@ -dataset_path: /ML-A100/team/mm/zhangge/MMMU_V/verify_interface_vision/annotate_stage2/processed_jsonl/shuffled.jsonl +dataset_path: ./MMMU_V/verify_interface_vision/annotate_stage2/processed_jsonl/shuffled.jsonl task: "mmmu_aug_group_img" test_split: augument output_type: generate_until diff --git a/lmms_eval/tasks/ocrbench_all_vd/ocrbench_suit.yaml b/lmms_eval/tasks/ocrbench_all_vd/ocrbench_suit.yaml index 1d860e5b7..28a878752 100644 --- a/lmms_eval/tasks/ocrbench_all_vd/ocrbench_suit.yaml +++ b/lmms_eval/tasks/ocrbench_all_vd/ocrbench_suit.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-VD/ocrbench -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-VD/ocrbench ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "ocrbench_vd" diff --git a/lmms_eval/tasks/ocrbench_suit/ocrbench_suit.yaml b/lmms_eval/tasks/ocrbench_suit/ocrbench_suit.yaml index d6b69e93b..3e6286b7e 100644 --- a/lmms_eval/tasks/ocrbench_suit/ocrbench_suit.yaml +++ b/lmms_eval/tasks/ocrbench_suit/ocrbench_suit.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-jsonl/ocrbench -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-jsonl/ocrbench ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "ocrbench_suit" diff --git a/lmms_eval/tasks/ocrbench_suit_vd/ocrbench_suit.yaml b/lmms_eval/tasks/ocrbench_suit_vd/ocrbench_suit.yaml index 034a4275b..b6403c659 100644 --- a/lmms_eval/tasks/ocrbench_suit_vd/ocrbench_suit.yaml +++ 
b/lmms_eval/tasks/ocrbench_suit_vd/ocrbench_suit.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-selected-VD/ocrbench -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-selected-VD/ocrbench ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e dataset_kwargs: token: True task: "ocrbench_suit_vd" diff --git a/lmms_eval/tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml b/lmms_eval/tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml index 31b76aec1..da9dad0e7 100755 --- a/lmms_eval/tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml +++ b/lmms_eval/tasks/ok_vqa_all_vd/ok_vqa_val2014.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-VD/OK-VQA/ -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-VD/OK-VQA/ ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e group: ok_vqa_vd task: ok_vqa_val2014_vd test_split: train diff --git a/lmms_eval/tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml b/lmms_eval/tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml index c2bdb617b..32902a0ef 100755 --- a/lmms_eval/tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml +++ b/lmms_eval/tasks/ok_vqa_suit_vd/ok_vqa_val2014.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD dataset_path: ./vlms-bench-data-selected-VD/OK-VQA/ -======= -dataset_path: /ML-A100/team/mm/zk/lmms-eval/vlms-bench-data-selected-VD/OK-VQA/ ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e group: ok_vqa_suit_vd task: ok_vqa_suit_val2014_vd test_split: train diff --git a/lmms_eval/tasks/siwei_bench_sub1/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub1/siwei_bench.yaml deleted file mode 100755 index 3f4d60b3a..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1/siwei_bench.yaml +++ /dev/null @@ -1,3 +0,0 @@ -group: siwei_bench_sub1 -task: -- siwei_bench_layout diff --git a/lmms_eval/tasks/siwei_bench_sub1/siwei_bench_layout.yaml b/lmms_eval/tasks/siwei_bench_sub1/siwei_bench_layout.yaml deleted file mode 100644 index de87975fc..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1/siwei_bench_layout.yaml +++ /dev/null @@ -1,22 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Layout -task: "siwei_bench_layout" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub1/utils.py b/lmms_eval/tasks/siwei_bench_sub1/utils.py deleted file mode 100755 index 88c35ffbe..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1/utils.py +++ /dev/null @@ -1,96 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images, and four answers, one of them is correct. 
Please choose one of the three answers.\ - please only answer the question with A, B, C.\ - questions: {question} \ - answer: A: {A} B: {B} C: {C}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-C])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-C])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C']]) - if doc['answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench.yaml deleted file mode 100755 index d817255bd..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench.yaml +++ /dev/null @@ -1,3 +0,0 @@ -group: siwei_bench_sub1_vd -task: -- siwei_bench_layout_vd diff --git a/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench_layout.yaml b/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench_layout.yaml deleted file mode 100644 index d3abfaaab..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1_vd/siwei_bench_layout.yaml +++ /dev/null @@ -1,26 +0,0 @@ -<<<<<<< HEAD 
-dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Layout -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/Layout ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_layout_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub1_vd/utils.py b/lmms_eval/tasks/siwei_bench_sub1_vd/utils.py deleted file mode 100755 index b5254dbb6..000000000 --- a/lmms_eval/tasks/siwei_bench_sub1_vd/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images , descriptions of each images, and three answers, one of them is correct. Please choose one of the three answers.\ - please only answer the question with A, B, C.\ - description of image1:{image1},description of image2:{image2},\ - questions: {question} \ - answer: A: {A} B: {B} C: {C}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(image1=doc['image1_VD'],image2=doc['image2_VD'],question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-C])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-C])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, 
start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C']]) - if doc['answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench.yaml deleted file mode 100755 index 63776cc64..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench.yaml +++ /dev/null @@ -1,9 +0,0 @@ -group: siwei_bench_sub2 -task: -- siwei_bench_atlocation -- siwei_bench_environment -- siwei_bench_madeof -- siwei_bench_nearby -- siwei_bench_partof -- siwei_bench_Used_For - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Environment.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Environment.yaml deleted file mode 100755 index cddb5c637..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Environment.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Environment -task: "siwei_bench_environment" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_MadeOf.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_MadeOf.yaml deleted file mode 100755 index e7f9e5275..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_MadeOf.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/MadeOf -task: "siwei_bench_madeof" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - 
diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Used_For.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Used_For.yaml deleted file mode 100755 index 6cb7d6fda..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_Used_For.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Used_For -task: "siwei_bench_Used_For" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_atlocation.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_atlocation.yaml deleted file mode 100755 index 5599f63c1..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_atlocation.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/AtLocation -task: "siwei_bench_atlocation" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_nearby.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_nearby.yaml deleted file mode 100755 index 3519d3d22..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_nearby.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/NearBy -task: "siwei_bench_nearby" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_partof.yaml b/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_partof.yaml deleted file mode 100755 index d0c9f4e41..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/siwei_bench_partof.yaml +++ /dev/null @@ -1,26 +0,0 @@ -dataset_path: 
/xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/PartOf -task: "siwei_bench_partof" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2/utils.py b/lmms_eval/tasks/siwei_bench_sub2/utils.py deleted file mode 100755 index 1ad5e31cb..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2/utils.py +++ /dev/null @@ -1,96 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images, and four answers, one of them is correct. Please choose one of the four answers.\ - please only answer the question with A, B, C or D.\ - questions: {question} \ - answer: A: {A} B: {B} C: {C} D:{D}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C'],D=doc['options']['D']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-D])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-D])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - 
max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D']]) - if doc['answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench.yaml deleted file mode 100755 index 76edb2c46..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench.yaml +++ /dev/null @@ -1,7 +0,0 @@ -group: siwei_bench_sub2_shuffle -task: -- siwei_bench_atlocation_shuffle -- siwei_bench_madeof_shuffle -- siwei_bench_nearby_shuffle -- siwei_bench_partof_shuffle - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Environment.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Environment.yaml deleted file mode 100755 index 022295afe..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Environment.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/Environment -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/Environment ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_environment" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_MadeOf.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_MadeOf.yaml deleted file mode 100755 index aca4ddfb5..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_MadeOf.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/MadeOf -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/MadeOf ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_madeof_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: 
submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Used_For.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Used_For.yaml deleted file mode 100755 index 7c6e6de9b..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_Used_For.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/Used_For -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/Used_For ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_Used_For" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_atlocation.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_atlocation.yaml deleted file mode 100755 index f81efdf64..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_atlocation.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/AtLocation -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/AtLocation ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_atlocation_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_nearby.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_nearby.yaml deleted file mode 100755 index 22775fcab..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_nearby.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/NearBy -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/NearBy ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_nearby_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - 
max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_partof.yaml b/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_partof.yaml deleted file mode 100755 index abe151c5a..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/siwei_bench_partof.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/PartOf -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/PartOf ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_partof_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_shuffle/utils.py b/lmms_eval/tasks/siwei_bench_sub2_shuffle/utils.py deleted file mode 100755 index 84d8a355a..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_shuffle/utils.py +++ /dev/null @@ -1,96 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images, and five answers, one of them is correct. 
Please choose one of the five answers.\ - please only answer the question with A, B, C, D or E.\ - questions: {question} \ - answer: A: {A} B: {B} C: {C} D:{D} E:{E}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C'],D=doc['options']['D'],E=doc['options']['E']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image2']).convert("RGB"),base64_to_pil_image(doc['image1']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-E])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-E])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D'],doc['options']['E']]) - if doc['last_answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench.yaml deleted file mode 100755 index 337456876..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench.yaml +++ /dev/null @@ -1,9 +0,0 @@ -group: siwei_bench_sub2_vd -task: -- siwei_bench_atlocation_vd -- siwei_bench_environment_vd -- siwei_bench_madeof_vd -- siwei_bench_nearby_vd -- siwei_bench_partof_vd -- siwei_bench_Used_For_vd - diff --git 
a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Environment.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Environment.yaml deleted file mode 100755 index 003175b65..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Environment.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Environment -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/Environment ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_environment_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_MadeOf.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_MadeOf.yaml deleted file mode 100755 index 2841565cf..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_MadeOf.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/MadeOf -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/MadeOf ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_madeof_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Used_For.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Used_For.yaml deleted file mode 100755 index dda863758..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_Used_For.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/Used_For -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/Used_For ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_Used_For_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function 
utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_atlocation.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_atlocation.yaml deleted file mode 100755 index 7e044a54b..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_atlocation.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/AtLocation -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/AtLocation ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_atlocation_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_nearby.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_nearby.yaml deleted file mode 100755 index 5f92e3a20..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_nearby.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/NearBy -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/NearBy ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_nearby_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_partof.yaml b/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_partof.yaml deleted file mode 100755 index 689290d44..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/siwei_bench_partof.yaml +++ /dev/null @@ -1,30 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/PartOf -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/PartOf ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_partof_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: 
exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - metric: submission - aggregation: !function utils.siwei_bench_aggregate_submissions - higher_is_better: true -# metric_list: - -# - metric: submission -# aggregation: !function utils.siwei_bench_aggregate_submissions - diff --git a/lmms_eval/tasks/siwei_bench_sub2_vd/utils.py b/lmms_eval/tasks/siwei_bench_sub2_vd/utils.py deleted file mode 100755 index e4db4ac1b..000000000 --- a/lmms_eval/tasks/siwei_bench_sub2_vd/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics - -PROMPT = 'You will be giving one question, two images, descriptions of each images, and four answers, one of them is correct. Please choose one of the four answers.\ - please only answer the question with A, B, C or D.\ - description of image1:{image1},description of image2:{image2},\ - questions: {question} \ - answer: A: {A} B: {B} C: {C} D:{D}\ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(image1=doc['image1_VD'],image2=doc['image2_VD'],question=doc['question'],A=doc['options']['A'], B=doc['options']['B'], C=doc['options']['C'],D=doc['options']['D']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual - - -def extract_option_labels(text, options=None): - if isinstance(text, dict): - return "error" - pattern = r"\(([A-D])\)" - matches = re.findall(pattern, text) - - if not matches: - pattern = r"\b([A-D])\b" - matches = re.findall(pattern, text) - - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - else: - if options: - counter = Counter() - for i, option in enumerate(options, start=1): - label = chr(64 + i) - option_stripped = option.strip() - if option_stripped in text: - counter[label] += 1 - elif text in option: - counter[label] += 1 - if counter: - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - return None - - -def siwei_bench_process_results(doc, results): - response = results[0] - predict = extract_option_labels(response, [doc['options']['A'], 
doc['options']['B'], doc['options']['C'], doc['options']['D']]) - if doc['answer']==predict: - accuracy=1.0 - else: - accuracy=0.0 - return {"exact_match": accuracy,"submission": {"id": doc["idx"], "predict_answer": predict, "response": response}} - - -def siwei_bench_aggregate_submissions(results, args): - file = generate_submission_file("siwei_bench_test_for_submission.json", args) - with open(file, "w") as f: - json.dump(results, f, indent=4) - logger.info(f"Results saved to {file}") diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench.yaml deleted file mode 100755 index e48c71e16..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench.yaml +++ /dev/null @@ -1,7 +0,0 @@ -group: siwei_bench_sub3 -task: -- siwei_bench_shapesimilarto -- siwei_bench_subevent -- siwei_bench_similar_event -- siwei_bench_hasproperty - diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_hasproperty.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_hasproperty.yaml deleted file mode 100644 index d79677074..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_hasproperty.yaml +++ /dev/null @@ -1,36 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty -task: "siwei_bench_hasproperty" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # - metric: submission - # aggregation: !function utils.siwei_bench_aggregate_submissions - # higher_is_better: true - # - metric: siwei_bench_precision - # aggregation: !function utils.siwei_bench_aggregate_precision - # higher_is_better: true - # - metric: siwei_bench_recall - # aggregation: !function utils.siwei_bench_aggregate_recall - # higher_is_better: true - # - metric: siwei_bench_f1_score - # aggregation: !function utils.siwei_bench_aggregate_f1_score - # higher_is_better: true - # - metric: siwei_bench_yes_ratio - # aggregation: !function utils.siwei_bench_aggregate_yes_ratio - # higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_shapesimilarto.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_shapesimilarto.yaml deleted file mode 100755 index 2373c9b83..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_shapesimilarto.yaml +++ /dev/null @@ -1,20 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/ShapeSimilarTo -task: "siwei_bench_shapesimilarto" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_similar_event.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_similar_event.yaml deleted file mode 100644 
index 95a79417c..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_similar_event.yaml +++ /dev/null @@ -1,20 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent -task: "siwei_bench_similar_event" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_subevent.yaml b/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_subevent.yaml deleted file mode 100644 index 3d12437a3..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/siwei_bench_subevent.yaml +++ /dev/null @@ -1,20 +0,0 @@ -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SubEvent -task: "siwei_bench_subevent" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3/utils.py b/lmms_eval/tasks/siwei_bench_sub3/utils.py deleted file mode 100755 index abbd6d187..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3/utils.py +++ /dev/null @@ -1,138 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics -import re - -def remove_punctuation(text): - # 定义正则表达式模式以匹配所有标点符号 - pattern = r'[^\w\s]' - # 使用正则表达式替换标点符号为空字符串 - return re.sub(pattern, '', text) - -PROMPT = 'You will be giving one question and two images. Please answer the question using "Yes" or "No". 
\ - Please only answer the question with Yes or No.\ - questions: {question} \ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual -def extract_yes_no(response): - # 定义正则表达式模式,匹配 "yes" 或 "no" - pattern = r'\b(yes|no)\b' - # 使用正则表达式搜索response中的匹配项 - matches = re.findall(pattern, response, re.IGNORECASE) - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - # 返回匹配项列表 - return None - - - -def siwei_bench_process_results(doc, results): - response = remove_punctuation(results[0]) - pred = response.lower().strip() - gt_ans = doc["answer"].lower().strip() - # idx=doc["idx"] - assert gt_ans in ["yes", "no"] - if pred not in ["yes", "no"]: - pred=extract_yes_no(pred) - score = 1.0 if pred == gt_ans else 0.0 - # predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D']]) - # if doc['answer']==predict: - # accuracy=1.0 - # else: - # accuracy=0.0 - return {"exact_match": score,"submission": {"id": doc["idx"], "predict_answer": pred, "response": response}} - - -# def siwei_bench_aggregate_accuracy(results): -# total_score = 0 -# for result in results: -# total_score += result["score"] -# avg_score = total_score / len(results) -# return avg_score - - -# def siwei_bench_aggregate_precision(results): -# true_positives = 0 -# false_positives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "no" and pred == "yes": -# false_positives += 1 -# precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 -# return precision - - -# def siwei_bench_aggregate_recall(results): -# true_positives = 0 -# false_negatives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "yes" and pred == "no": -# false_negatives += 1 -# recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 -# return recall - - -# def siwei_bench_aggregate_f1_score(results): -# precision = pope_aggregate_precision(results) -# recall = pope_aggregate_recall(results) -# 
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 -# return f1_score - - -# def siwei_bench_aggregate_yes_ratio(results): -# yes_count = 0 -# no_count = 0 -# for result in results: -# gt = result["ground_truth"] -# if gt == "yes": -# yes_count += 1 -# elif gt == "no": -# no_count += 1 -# yes_ratio = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 -# return yes_ratio diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench.yaml deleted file mode 100755 index 6581d4f02..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench.yaml +++ /dev/null @@ -1,5 +0,0 @@ -group: siwei_bench_sub3_shuffle -task: -- siwei_bench_shapesimilarto_shuffle -- siwei_bench_subevent_shuffle - diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_hasproperty.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_hasproperty.yaml deleted file mode 100644 index c128da9c7..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_hasproperty.yaml +++ /dev/null @@ -1,38 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_hasproperty" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # - metric: submission - # aggregation: !function utils.siwei_bench_aggregate_submissions - # higher_is_better: true - # - metric: siwei_bench_precision - # aggregation: !function utils.siwei_bench_aggregate_precision - # higher_is_better: true - # - metric: siwei_bench_recall - # aggregation: !function utils.siwei_bench_aggregate_recall - # higher_is_better: true - # - metric: siwei_bench_f1_score - # aggregation: !function utils.siwei_bench_aggregate_f1_score - # higher_is_better: true - # - metric: siwei_bench_yes_ratio - # aggregation: !function utils.siwei_bench_aggregate_yes_ratio - # higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_shapesimilarto.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_shapesimilarto.yaml deleted file mode 100755 index dbc4762dd..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_shapesimilarto.yaml +++ /dev/null @@ -1,22 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/ShapeSimilarTo -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/ShapeSimilarTo ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_shapesimilarto_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - 
higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_similar_event.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_similar_event.yaml deleted file mode 100644 index d0abdad5c..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_similar_event.yaml +++ /dev/null @@ -1,22 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_similar_event" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_subevent.yaml b/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_subevent.yaml deleted file mode 100644 index 461fbe6bb..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/siwei_bench_subevent.yaml +++ /dev/null @@ -1,24 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/SubEvent -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf_img_2_1/SubEvent ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_subevent_shuffle" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_shuffle/utils.py b/lmms_eval/tasks/siwei_bench_sub3_shuffle/utils.py deleted file mode 100755 index 6f528feb3..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_shuffle/utils.py +++ /dev/null @@ -1,138 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics -import re - -def remove_punctuation(text): - # 定义正则表达式模式以匹配所有标点符号 - pattern = r'[^\w\s]' - # 使用正则表达式替换标点符号为空字符串 - return re.sub(pattern, '', text) - -PROMPT = 'You will be giving one question and two images. Please answer the question using "Yes" or "No". 
\ - Please only answer the question with Yes or No.\ - questions: {question} \ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(question=doc['question']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image2']).convert("RGB"),base64_to_pil_image(doc['image1']).convert("RGB")] ######################################### load image from base64 encoding - return visual -def extract_yes_no(response): - # 定义正则表达式模式,匹配 "yes" 或 "no" - pattern = r'\b(yes|no)\b' - # 使用正则表达式搜索response中的匹配项 - matches = re.findall(pattern, response, re.IGNORECASE) - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - # 返回匹配项列表 - return None - - - -def siwei_bench_process_results(doc, results): - response = remove_punctuation(results[0]) - pred = response.lower().strip() - gt_ans = doc["last_answer"].lower().strip() - # idx=doc["idx"] - assert gt_ans in ["yes", "no"] - if pred not in ["yes", "no"]: - pred=extract_yes_no(pred) - score = 1.0 if pred == gt_ans else 0.0 - # predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D']]) - # if doc['answer']==predict: - # accuracy=1.0 - # else: - # accuracy=0.0 - return {"exact_match": score,"submission": {"id": doc["idx"], "predict_answer": pred, "response": response}} - - -# def siwei_bench_aggregate_accuracy(results): -# total_score = 0 -# for result in results: -# total_score += result["score"] -# avg_score = total_score / len(results) -# return avg_score - - -# def siwei_bench_aggregate_precision(results): -# true_positives = 0 -# false_positives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "no" and pred == "yes": -# false_positives += 1 -# precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 -# return precision - - -# def siwei_bench_aggregate_recall(results): -# true_positives = 0 -# false_negatives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "yes" and pred == "no": -# false_negatives += 1 -# recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 -# return recall - - -# def siwei_bench_aggregate_f1_score(results): -# precision = pope_aggregate_precision(results) -# recall = pope_aggregate_recall(results) 
-# f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 -# return f1_score - - -# def siwei_bench_aggregate_yes_ratio(results): -# yes_count = 0 -# no_count = 0 -# for result in results: -# gt = result["ground_truth"] -# if gt == "yes": -# yes_count += 1 -# elif gt == "no": -# no_count += 1 -# yes_ratio = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 -# return yes_ratio diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench.yaml deleted file mode 100755 index 2d2bf6c2e..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench.yaml +++ /dev/null @@ -1,7 +0,0 @@ -group: siwei_bench_sub3_vd -task: -- siwei_bench_shapesimilarto_vd -- siwei_bench_subevent_vd -- siwei_bench_similar_event_vd -- siwei_bench_hasproperty_vd - diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_hasproperty.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_hasproperty.yaml deleted file mode 100644 index 813fdfa34..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_hasproperty.yaml +++ /dev/null @@ -1,38 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/HasProperty ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_hasproperty_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # - metric: submission - # aggregation: !function utils.siwei_bench_aggregate_submissions - # higher_is_better: true - # - metric: siwei_bench_precision - # aggregation: !function utils.siwei_bench_aggregate_precision - # higher_is_better: true - # - metric: siwei_bench_recall - # aggregation: !function utils.siwei_bench_aggregate_recall - # higher_is_better: true - # - metric: siwei_bench_f1_score - # aggregation: !function utils.siwei_bench_aggregate_f1_score - # higher_is_better: true - # - metric: siwei_bench_yes_ratio - # aggregation: !function utils.siwei_bench_aggregate_yes_ratio - # higher_is_better: true -# metric_list: \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_shapesimilarto.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_shapesimilarto.yaml deleted file mode 100755 index a7ac8049d..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_shapesimilarto.yaml +++ /dev/null @@ -1,22 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/ShapeSimilarTo -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/ShapeSimilarTo ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_shapesimilarto_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - 
higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_similar_event.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_similar_event.yaml deleted file mode 100644 index bb6c0fbbc..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_similar_event.yaml +++ /dev/null @@ -1,22 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/SimilarEvent ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_similar_event_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 32 - temperature: 0 - do_sample: False -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_subevent.yaml b/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_subevent.yaml deleted file mode 100644 index 8d4535189..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/siwei_bench_subevent.yaml +++ /dev/null @@ -1,24 +0,0 @@ -<<<<<<< HEAD -dataset_path: /xpfs/public/gezhang/zk/MMIR_codebase/Mantis/final_bench_hf/SubEvent -======= -dataset_path: /ML-A100/team/mm/zk/MMIR_codebase/Mantis/final_bench_hf/SubEvent ->>>>>>> 865c7069caf994108f2fb1c2648cb346c8741a4e -task: "siwei_bench_subevent_vd" -test_split: train -output_type: generate_until -doc_to_visual: !function utils.siwei_bench_doc_to_visual -doc_to_text: !function utils.siwei_bench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 128 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.siwei_bench_process_results -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true \ No newline at end of file diff --git a/lmms_eval/tasks/siwei_bench_sub3_vd/utils.py b/lmms_eval/tasks/siwei_bench_sub3_vd/utils.py deleted file mode 100755 index 14928b7cc..000000000 --- a/lmms_eval/tasks/siwei_bench_sub3_vd/utils.py +++ /dev/null @@ -1,140 +0,0 @@ -import json - -import re -from collections import Counter -from lmms_eval.tasks._task_utils.file_utils import generate_submission_file -from PIL import Image -import base64 -from io import BytesIO -from loguru import logger -import statistics -import re - -def remove_punctuation(text): - # 定义正则表达式模式以匹配所有标点符号 - pattern = r'[^\w\s]' - # 使用正则表达式替换标点符号为空字符串 - return re.sub(pattern, '', text) - -PROMPT = 'You will be giving one question , two images and descriptions of each images, Please answer the question using "Yes" or "No". 
\ - Please only answer the question with Yes or No.\ - description of image1:{image1},description of image2:{image2},\ - questions: {question} \ - Your answer is ' - - -def siwei_bench_doc_to_text(doc): - question=PROMPT.format(image1=doc['image1_VD'],image2=doc['image2_VD'],question=doc['question']) - # question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]) - # pre_prompt = model_specific_prompt_kwargs["pre_prompt"] - # post_prompt = model_specific_prompt_kwargs["post_prompt"] - # return f"{pre_prompt}{question}{post_prompt}" - return question - - -# def siwei_bench_doc_to_visual(doc): -# return [doc["image1"].convert("RGB"),doc["image2"].convert("RGB")] -def base64_to_pil_image(base64_string): - img_bytes = base64.b64decode(base64_string) - - buffered = BytesIO(img_bytes) - - image = Image.open(buffered) - # image.save('temp.png') - return image - -def siwei_bench_doc_to_visual(doc): - # prompt = construct_prompt(doc) - # image_tokens = re.findall(r"", prompt) - # # Remove <> and swap space as _ - # image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens] - visual = [base64_to_pil_image(doc['image1']).convert("RGB"),base64_to_pil_image(doc['image2']).convert("RGB")] ######################################### load image from base64 encoding - return visual -def extract_yes_no(response): - # 定义正则表达式模式,匹配 "yes" 或 "no" - pattern = r'\b(yes|no)\b' - # 使用正则表达式搜索response中的匹配项 - matches = re.findall(pattern, response, re.IGNORECASE) - if matches: - counter = Counter(matches) - most_common = counter.most_common() - max_count = most_common[0][1] - candidates = [item for item in most_common if item[1] == max_count] - return candidates[-1][0] - - # 返回匹配项列表 - return None - - - -def siwei_bench_process_results(doc, results): - response = remove_punctuation(results[0]) - pred = response.lower().strip() - gt_ans = doc["answer"].lower().strip() - # idx=doc["idx"] - assert gt_ans in ["yes", "no"] - if pred not in ["yes", "no"]: - pred=extract_yes_no(pred) - score = 1.0 if pred == gt_ans else 0.0 - # predict = extract_option_labels(response, [doc['options']['A'], doc['options']['B'], doc['options']['C'], doc['options']['D']]) - # if doc['answer']==predict: - # accuracy=1.0 - # else: - # accuracy=0.0 - return {"exact_match": score,"submission": {"id": doc["idx"], "predict_answer": pred, "response": response}} - - -# def siwei_bench_aggregate_accuracy(results): -# total_score = 0 -# for result in results: -# total_score += result["score"] -# avg_score = total_score / len(results) -# return avg_score - - -# def siwei_bench_aggregate_precision(results): -# true_positives = 0 -# false_positives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "no" and pred == "yes": -# false_positives += 1 -# precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0 -# return precision - - -# def siwei_bench_aggregate_recall(results): -# true_positives = 0 -# false_negatives = 0 -# for result in results: -# pred = result["prediction"] -# gt = result["ground_truth"] -# if gt == "yes" and pred == "yes": -# true_positives += 1 -# elif gt == "yes" and pred == "no": -# false_negatives += 1 -# recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0 -# return recall - - -# def 
siwei_bench_aggregate_f1_score(results): -# precision = pope_aggregate_precision(results) -# recall = pope_aggregate_recall(results) -# f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 -# return f1_score - - -# def siwei_bench_aggregate_yes_ratio(results): -# yes_count = 0 -# no_count = 0 -# for result in results: -# gt = result["ground_truth"] -# if gt == "yes": -# yes_count += 1 -# elif gt == "no": -# no_count += 1 -# yes_ratio = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 -# return yes_ratio diff --git a/test.py b/test.py deleted file mode 100644 index 58cdc3aa9..000000000 --- a/test.py +++ /dev/null @@ -1,6 +0,0 @@ -import json -with open('/ML-A100/team/mm/zhangge/domain_data_pipeline/llm_label_data_pipeline/fasttext_seed_data/chemistry/pos/pos.jsonl') as jsonl_file: - for line in jsonl_file: - data=json.loads(line) - print(data) - break \ No newline at end of file diff --git a/test_blip.py b/test_blip.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_cambrian.py b/test_cambrian.py deleted file mode 100644 index d8409d25b..000000000 --- a/test_cambrian.py +++ /dev/null @@ -1,8 +0,0 @@ -from lmms_eval.models.cambrian_8b import * -from cambrian.model.builder import load_pretrained_model -from cambrian.conversation import conv_templates, SeparatorStyle -from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -pretrained='/ML-A100/team/mm/zhangge/models/cambrian_8b' -model_name = get_model_name_from_path(pretrained) -print(model_name) -tokenizer, model, image_processor, context_len = load_pretrained_model(pretrained, None, model_name) \ No newline at end of file diff --git a/visual_code/echart.js b/visual_code/echart.js new file mode 100644 index 000000000..05e06c29a --- /dev/null +++ b/visual_code/echart.js @@ -0,0 +1,145 @@ +var data = [ + { + "name": "Descriptive", + "children": [ + { + "name": "General Description", + "value": 142 + }, + { + "name": "Activity Description", + "value": 1 + } + ] + }, + { + "name": "Analytical", + "children": [ + { + "name": "Data Analysis", + "value": 197 + }, + { + "name": "Cultural Analysis", + "value": 2 + }, + { + "name": "Analytical - Data Analysis", + "value": 1 + }, + { + "name": "Attribute-based Question Answer", + "value": 2 + } + ] + }, + { + "name": "Recognition", + "children": [ + { + "name": "Object Recognition", + "value": 339 + }, + { + "name": "Location Identification", + "value": 2 + }, + { + "name": "Text Recognition", + "value": 89 + }, + 1 + ] + }, + { + "name": "Instructive", + "children": [ + { + "name": "How-to Guides", + "value": 16 + } + ] + }, + { + "name": "Comprehensive", + "children": [ + { + "name": "Cultural Analysis", + "value": 7 + } + ] + }, + { + "name": "Question", + "children": [ + { + "name": "Recognition", + "value": 1 + } + ] + }, + { + "name": "Instructional", + "children": [ + { + "name": "Math Problem Solving", + "value": 1 + } + ] + } +]; +option = { + title: { + text: 'WORLD COFFEE RESEARCH SENSORY LEXICON', + subtext: 'Source: https://worldcoffeeresearch.org/work/sensory-lexicon/', + textStyle: { + fontSize: 10, + align: 'center' + }, + subtextStyle: { + align: 'center' + }, + sublink: 'https://worldcoffeeresearch.org/work/sensory-lexicon/' + }, + series: { + type: 'sunburst', + data: data, + radius: [0, '95%'], + sort: undefined, + emphasis: { + focus: 'ancestor' + }, + levels: [ + {}, + { + r0: '15%', + r: '35%', + itemStyle: { + borderWidth: 2 + }, + label: { + rotate: 
'tangential' + } + }, + { + r0: '35%', + r: '70%', + label: { + align: 'right' + } + }, + { + r0: '70%', + r: '72%', + label: { + position: 'outside', + padding: 3, + silent: false + }, + itemStyle: { + borderWidth: 3 + } + } + ] + } +}; \ No newline at end of file diff --git a/visual_code/echart_new.js b/visual_code/echart_new.js new file mode 100644 index 000000000..18c7d5294 --- /dev/null +++ b/visual_code/echart_new.js @@ -0,0 +1,283 @@ +var data = [ + { + "name": "Descriptive", + "itemStyle": { + "color": 'rgba(44, 157, 143)' // 基准颜色:蓝色 + }, + "label": { + "fontSize": 16 // 设置字体大小 + }, + "children": [ + { + "name": "General Description", + "value": 146, + "itemStyle": { + "color": 'rgba(108, 186, 177)' // 渐变:较深的颜色 + + }, + "label": { + 'position': 'inside', + 'rotate': 'tangential', + "fontSize": 16, // 设置字体大小 + "color": '#FFFFFF' // 设置字体颜色为白色 + + } + }, + { + "name": "Activity Description", + "value": 6, + "itemStyle": { + "color":'rgba(108, 186, 177)',// 渐变:较浅的颜色 + }, + label: { + align: 'right', + "fontSize": 10, + "color": '#FFFFFF' // 设置字体颜色为白色 + + }, + } + ] + }, + { + "name": "Analytical", + "itemStyle": { + "color": 'rgba(243, 162, 98)' // 基准颜色:蓝绿色 + }, + "label": { + "fontSize": 16 // 设置字体大小 + }, + "children": [ + { + "name": "Data Analysis", + "value": 197, + "itemStyle": { + "color": 'rgba(246, 190, 146)' // 较深的蓝绿色 + }, + "label": { + 'position': 'inside', + 'rotate': 'tangential', + "fontSize": 16 // 设置字体大小 + } + }, + { + "name": "Cultural Analysis", + "value": 6, + "itemStyle": { + "color": 'rgba(246, 190, 146)' // 较浅的蓝绿色 + }, + label: { + align: 'right', + "fontSize": 8, + }, + }, + { + "name": "Analytical - Data Analysis", + "value": 7, + "itemStyle": { + "color": 'rgba(246, 190, 146)' // 更浅的蓝绿色 + }, + label: { + align: 'right', + "fontSize": 8, + }, + }, + { + "name": "Attribute-based Question Answer", + "value": 6, + "itemStyle": { + "color": 'rgba(246, 190, 146)' // 最浅的蓝绿色 + }, + label: { + align: 'right', + "fontSize": 8, + }, + } + ] + }, + { + "name": "Recognition", + "itemStyle": { + "color":'rgba(232, 195, 107)' // 基准颜色:绿色 + }, + "label": { + "fontSize": 16 // 设置字体大小 + }, + "children": [ + { + "name": "Object Recognition", + "value": 339, + "itemStyle": { + "color": 'rgba(237, 213, 151)' // 较深的绿色 + }, + "label": { + 'position': 'inside', + 'rotate': 'tangential', + "fontSize": 16 // 设置字体大小 + } + }, + { + "name": "Location Identification", + "value": 6, + "itemStyle": { + "color": 'rgba(237, 213, 151)' // 较浅的绿色 + }, + label: { + align: 'right', + "fontSize": 10, + }, + }, + { + "name": "Text Recognition", + "value": 89, + "itemStyle": { + "color": 'rgba(237, 213, 151)' // 更浅的绿色 + }, + "label": { + 'position': 'inside', + 'rotate': 'tangential', + "fontSize": 16 // 设置字体大小 + } + }, + { + "name": "Other", + "value": 15, + "itemStyle": { + "color": 'rgba(237, 213, 151)' // 最浅的绿色 + }, + label: { + align: 'right', + "fontSize": 16, + }, + } + ] + }, + { + "name": "Instructive", + "itemStyle": { + "color": '#AEC48F' // 基准颜色:浅绿色 + }, + label: { + align: 'right' + }, + "children": [ + { + "name": "How-to Guides", + "value": 16, + "itemStyle": { + "color": 'rgba(176, 196, 145, 0.8)' // 较深的浅绿色 + }, + label: { + align: 'right', + "fontSize": 16 + }, + } + ] + }, + { + "name": "Comprehensive", + "itemStyle": { + "color": '#FFDB5C' // 基准颜色:黄色 + }, + label: { + align: 'right', + "fontSize": 9 + }, + "children": [ + { + "name": "Cultural Analysis", + "value": 7, + "itemStyle": { + "color": 'rgba(255, 219, 96, 0.8)' // 较深的黄色 + }, + label: { + align: 'right', + "fontSize": 14 + }, 
+      }
+    ]
+  },
+  {
+    "name": "Question",
+    "itemStyle": {
+      "color": '#F98862' // base color: orange
+    },
+    label: {
+      align: 'right',
+      "fontSize": 8
+    },
+
+    "children": [
+      {
+        "name": "Recognition",
+        "value": 6,
+        "itemStyle": {
+          "color": 'rgba(249, 136, 98, 0.8)' // gradient orange
+        },
+        label: {
+          align: 'right',
+          "fontSize": 12
+        },
+      }
+    ]
+  },
+  {
+    "name": "Instructional",
+    "itemStyle": {
+      "color": '#E84A5F' // base color: red
+    },
+    label: {
+      align: 'right',
+      "fontSize": 8,
+      "color": '#000000' // set the font color to black
+
+    },
+    "children": [
+      {
+        "name": "Math Problem Solving",
+        "value": 6,
+        "itemStyle": {
+          "color": 'rgba(236, 76, 95, 0.8)' // gradient red
+        },
+        label: {
+          align: 'right',
+          "fontSize": 8,
+          "color": '#000000' // set the font color to black
+
+        },
+      }
+    ]
+  }
+];
+
+option = {
+  series: {
+    type: 'sunburst',
+    data: data,
+    radius: [0, '100%'],
+    label: {
+      rotate: 'radial',
+      fontSize: 12, // uniform font size
+      fontFamily: 'Arial', // uniform font family
+    },
+    levels: [
+      {},
+      {
+        r0: '0%',
+        r: '50%',
+        itemStyle: {
+          borderWidth: 2
+        },
+        label: {
+          // align: 'right'
+        }
+      },
+      {
+        r0: '50%',
+        r: '100%',
+        label: {
+          // align: 'right'
+        }
+      }
+    ]
+  }
+};
\ No newline at end of file
diff --git a/visual_code/plot_map.py b/visual_code/plot_map.py
new file mode 100644
index 000000000..5b539b2dd
--- /dev/null
+++ b/visual_code/plot_map.py
@@ -0,0 +1,108 @@
+import json
+from collections import defaultdict
+import numpy as np
+from scipy.interpolate import make_interp_spline
+import matplotlib.pyplot as plt
+from matplotlib import cm
+from matplotlib.colors import LinearSegmentedColormap
+# Load the JSON data
+with open('/gpfs/public/research/zk/lmms-eval/result/category_result.json', 'r') as f:
+    data = json.load(f)
+curve_color = (128/255, 0/255, 128/255)
+# Build a dictionary that stores each category and its subcategories
+category_data = defaultdict(lambda: defaultdict(int))
+
+# Iterate over the data and count the distribution of question_category and question_subcategory
+for entry in data:
+    category = entry["question_category"]
+    subcategory = entry["question_subcategory"]
+    category_data[category][subcategory] += 1
+
+# Flattening the two-dimensional dictionary to a list of tuples (category, subcategory, count)
+flat_data = []
+for category, subcategories in category_data.items():
+    for subcategory, count in subcategories.items():
+        if '(' in subcategory:
+            subcategory = subcategory.split('(')[0]
+        flat_data.append((category, subcategory, count))
+
+# Sorting the flat data by count in descending order
+flat_data_sorted = sorted(flat_data, key=lambda x: x[2], reverse=True)
+
+# Extracting the subcategory names and counts
+subcategory_names = [f"{item[1]}" for item in flat_data_sorted]
+counts = [item[2] for item in flat_data_sorted]
+
+# Calculate total count to convert frequencies to probabilities
+total_count = sum(counts)
+probabilities = [count / total_count for count in counts]  # Convert to probabilities
+
+# Prepare x and y values
+x = np.arange(len(probabilities))
+y = np.array(probabilities)
+
+# Generate a smooth curve using spline interpolation
+x_smooth = np.linspace(x.min(), x.max(), 300)  # Create 300 points between min and max of x
+spl = make_interp_spline(x, y, k=3)  # Spline of degree 3 for smooth curve
+y_smooth = spl(x_smooth)
+
+# Calculate cumulative probabilities
+cumulative_probabilities = np.cumsum(probabilities)
+cumulative_prob_smooth = make_interp_spline(x, cumulative_probabilities, k=3)(x_smooth)
+
+# Define a custom blue color gradient that starts from dark blue to light blue
+custom_blue_cmap = LinearSegmentedColormap.from_list("custom_blue", [(0, 0, 0.5), (0.5, 0.75, 1)], N=256)
+
+# Normalize the cumulative probabilities to get a gradient that moves from deep to light
+norm = plt.Normalize(vmin=cumulative_prob_smooth.min(), vmax=cumulative_prob_smooth.max())
+
+# Generate gradient color for each point based on the cumulative probability using the custom colormap
+colors = custom_blue_cmap(norm(cumulative_prob_smooth))
+# Plotting the smooth probability distribution curve with gradient based on cumulative values
+plt.figure(figsize=(12, 6))
+
+# Plot the smooth curve outline
+plt.plot(x_smooth, y_smooth, color='black', linewidth=2)
+
+# Fill the area under the curve with a color gradient from deep to light based on cumulative probabilities
+for i in range(len(x_smooth) - 1):
+    plt.fill_between(x_smooth[i:i+2], y_smooth[i:i+2], color=colors[i], alpha=0.8)
+
+# Enhancing ICLR-like style
+plt.grid(True, which='both', axis='x', linestyle='--', linewidth=0.5)  # Vertical gridlines at the x-tick positions only
+plt.xticks(x, subcategory_names, rotation=45, ha='right', fontsize=10, fontweight='bold')  # X-axis labels angled and aligned
+# plt.title("Smoothed Probability Distribution with Deep-to-Light Gradient", fontsize=14)
+# plt.xlabel("Category - Subcategory", fontsize=12)
+plt.ylabel("Probability", fontsize=12)
+
+# Turn off the horizontal gridlines (the y-axis gridlines)
+plt.grid(False, axis='y')
+
+# Adjust the curve to start at the y-axis
+plt.xlim(left=0)  # Start from y-axis (x=0)
+plt.tight_layout()
+
+# Save the figure before displaying it, then show the plot
+plt.savefig('/gpfs/public/research/zk/lmms-eval/static_figs/statics_fig.png')
+plt.savefig('/gpfs/public/research/zk/lmms-eval/static_figs/statics_fig.pdf')
+plt.show()
+# print(category_data)
+# Build the node format needed for the sunburst chart
+# def create_node(name, value=None, color=None, children=None):
+#     node = {"name": name}
+#     if value:
+#         node["value"] = value
+#     if color:
+#         node["itemStyle"] = {"color": color}
+#     if children:
+#         node["children"] = children
+#     return node
+
+# # Generate the sunburst data
+# sunburst_data = []
+# for category, subcategories in category_data.items():
+#     children = [create_node(subcat, value=count) for subcat, count in subcategories.items()]
+#     sunburst_data.append(create_node(category, children=children))
+
+# # Print the sunburst data
+# print(json.dumps(sunburst_data, indent=2))