diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index a6285bcd..8fd1b4ae 100644
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -172,17 +172,21 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         is_main_process = False
 
     for args in args_list:
-        if is_main_process:
-            wandb_logger = WandbLogger(args)
-        results = cli_evaluate_single(args)
-
-        accelerator.wait_for_everyone()
-        if is_main_process:
-            wandb_logger.log_eval_result(results)
-            if wandb_logger.online():
-                wandb_logger.write_to_report(results)
-            wandb_logger.finish()
-        results_list.append(results)
+        try:
+            if is_main_process and args.wandb_args:
+                wandb_logger = WandbLogger(args)
+            results = cli_evaluate_single(args)
+
+            accelerator.wait_for_everyone()
+            if is_main_process and args.wandb_args and results is not None:
+                wandb_logger.log_eval_result(results)
+                if wandb_logger.online():
+                    wandb_logger.write_to_report()
+                wandb_logger.finish()
+            results_list.append(results)
+        except Exception as e:
+            eval_logger.error(f"Error during evaluation: {e}")
+            results_list.append(None)
 
     for args, results in zip(args_list, results_list):
         # cli_evaluate will return none if the process is not the main process (rank 0)
@@ -209,7 +213,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     elif args.tasks == "list":
         eval_logger.info("Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS))))
         sys.exit()
-    elif args.tasks == "list_tasks_num":
+    elif args.tasks == "list_with_num":
         log_message = (
             "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
         )
diff --git a/lmms_eval/logging_utils.py b/lmms_eval/logging_utils.py
index 15b7cccc..f09bbaa6 100644
--- a/lmms_eval/logging_utils.py
+++ b/lmms_eval/logging_utils.py
@@ -71,11 +71,6 @@ def init_run(self):
         # initialize a W&B run
         self.run = wandb.init(**self.wandb_args)
 
-        # call wr inside the init_run method to avoid multiple times logging
-        import wandb.apis.reports as wr
-
-        self.wr = wr
-
     def log_eval_result(self, results):
         # Log configs to wandb
         configs = self.get_config(results)
@@ -217,80 +212,71 @@ def sanitize_results_dict(self):
         return wandb_summary, _results
 
     def prepare_report_by_task(self, results):
-        task_names = list(results.get("results", {}).keys())
+        import wandb.apis.reports as wr
+
+        task_names = list(self.results.get("results", {}).keys())
         blocks = []
         for task_name in task_names:
-            blocks.append(self.wr.H2(task_name))
+            blocks.append(wr.H2(task_name))
             panels = []
             for metric_name, metric_value in results.items():
                 if task_name in metric_name:
                     panels.append(
-                        self.wr.ScalarChart(
+                        wr.ScalarChart(
                             title=f"{metric_name}",
                             metric=f"{metric_name}",
                             font_size="large",
                         )
                     )
             _results = {
-                "results": {f"{task_name}": results.get("results").get(task_name)},
-                "versions": {f"{task_name}": results.get("versions").get(task_name)},
-                "n-shot": {f"{task_name}": results.get("n-shot").get(task_name)},
+                "results": {f"{task_name}": self.results.get("results").get(task_name)},
+                "versions": {f"{task_name}": self.results.get("versions").get(task_name)},
+                "n-shot": {f"{task_name}": self.results.get("n-shot").get(task_name)},
             }
             results_md = utils.make_table(_results)
-            blocks.extend([self.wr.MarkdownBlock(results_md), self.wr.PanelGrid(panels=panels)])
-            # blocks.extend([
-            #     self.wr.WeaveBlockSummaryTable(
-            #         project=self.run.project,
-            #         entity=self.run.entity,
-            #         table_name=f"{task_name}_eval_results",
-            #     ),
-            #     self.wr.PanelGrid(
-            #         runsets=[
-            #             self.wr.Runset(
-            #                 project=self.run.project, entity=self.run.entity,
-            #             ).set_filters_with_python_expr(f'Name == "{str(self.run.name)}"'),
-            #         ]
-            #     ),
-            # ])
+            blocks.extend([wr.MarkdownBlock(results_md), wr.PanelGrid(panels=panels)])
+            # TODO: Add results table
 
         return blocks
 
-    def write_to_report(self, results):
-        wandb_project = self.run.project
-        wandb_entity = self.run.entity
-        report = self.wr.Report(
+    def write_to_report(self):
+        import wandb.apis.reports as wr
+
+        report = wr.Report(
             project=self.run.project,
             entity=self.run.entity,
             title=f"({datetime.now().strftime('%Y-%m-%d %H:%M:%S')}) {self.run.id} - Evaluation report",
             description=f"Evaluation run by: {self.run.entity} logged to {self.run.url}",
         )
 
-        results_md = utils.make_table(results)
         task_blocks = self.prepare_report_by_task(self.wandb_results)
 
         blocks = (
             [
-                self.wr.TableOfContents(),
-                self.wr.H1("Complete Evaluation Results"),
-                self.wr.WeaveBlockSummaryTable(
+                wr.TableOfContents(),
+                wr.H1("Complete Evaluation Results"),
+                wr.WeaveBlockSummaryTable(
                     project=self.run.project,
                     entity=self.run.entity,
-                    table_name=f"evaluation/eval_results",
+                    table_name="evaluation/eval_results",
                 ),
-                self.wr.PanelGrid(
+                wr.PanelGrid(
                     runsets=[
-                        self.wr.Runset(
+                        wr.Runset(
                             project=self.run.project,
                             entity=self.run.entity,
                         ).set_filters_with_python_expr(f'Name == "{str(self.run.name)}"'),
                     ]
                 ),
-                self.wr.H1("Evaluation Results By Task"),
+                wr.H1("Evaluation Results By Task"),
             ]
             + task_blocks
             + [
-                self.wr.H1("Evaluation Config"),
-                self.wr.CodeBlock(json.dumps(self.results["config"], indent=5).split("\n"), language="json"),
+                wr.H1("Evaluation Config"),
+                wr.CodeBlock(
+                    json.dumps(self.results["config"], indent=5).split("\n"),
+                    language="json",
+                ),
                 # TODO: Add appendix
             ]
         )
diff --git a/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py b/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py
index f198c267..b012c8f2 100644
--- a/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py
+++ b/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py
@@ -47,9 +47,7 @@ def get_ltor_masks_and_position_ids(
         att_mask_batch = micro_batch_size
     else:
         att_mask_batch = 1
-    attention_mask = torch.tril(
-        torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
-    ).view(att_mask_batch, 1, seq_length, seq_length)
+    attention_mask = torch.tril(torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)).view(att_mask_batch, 1, seq_length, seq_length)
 
     # Loss mask.
     loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
@@ -66,7 +64,6 @@ def get_ltor_masks_and_position_ids(
     if reset_position_ids or reset_attention_mask:
         # Loop through the batches:
         for b in range(micro_batch_size):
-
             # Find indecies where EOD token is.
             eod_index = position_ids[b, data[b] == eod_token]
             # Detach indecies from positions if going to modify positions.
@@ -134,9 +131,7 @@ def make_context(
         nl_tokens = tokenizer.encode("\n")
 
         def _tokenize_str(role, content):
-            return f"{role}\n{content}", tokenizer.encode(
-                role, allowed_special=set(tokenizer.IMAGE_ST)
-            ) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST))
+            return f"{role}\n{content}", tokenizer.encode(role, allowed_special=set(tokenizer.IMAGE_ST)) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST))
 
         system_text, system_tokens_part = _tokenize_str("system", system)
         system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
@@ -148,22 +143,16 @@ def _tokenize_str(role, content):
             query_text, query_tokens_part = _tokenize_str("user", turn_query)
             query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
             if turn_response is not None:
-                response_text, response_tokens_part = _tokenize_str(
-                    "assistant", turn_response
-                )
+                response_text, response_tokens_part = _tokenize_str("assistant", turn_response)
                 response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
 
                 next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
-                prev_chat = (
-                    f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
-                )
+                prev_chat = f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
             else:
                 next_context_tokens = nl_tokens + query_tokens + nl_tokens
                 prev_chat = f"\n{im_start}{query_text}{im_end}\n"
 
-            current_context_size = (
-                len(system_tokens) + len(next_context_tokens) + len(context_tokens)
-            )
+            current_context_size = len(system_tokens) + len(next_context_tokens) + len(context_tokens)
             if current_context_size < max_window_size:
                 context_tokens = next_context_tokens + context_tokens
                 raw_text = prev_chat + raw_text
@@ -172,16 +161,7 @@ def _tokenize_str(role, content):
 
         context_tokens = system_tokens + context_tokens
         raw_text = f"{im_start}{system_text}{im_end}" + raw_text
-        context_tokens += (
-            nl_tokens
-            + im_start_tokens
-            + _tokenize_str("user", query)[1]
-            + im_end_tokens
-            + nl_tokens
-            + im_start_tokens
-            + tokenizer.encode("assistant")
-            + nl_tokens
-        )
+        context_tokens += nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] + im_end_tokens + nl_tokens + im_start_tokens + tokenizer.encode("assistant") + nl_tokens
         raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
 
     elif chat_format == "raw":
@@ -202,7 +182,7 @@ def _decode_default(
     raw_text_len: int,
     verbose: bool = False,
     return_end_reason: bool = False,
-    errors: str='replace',
+    errors: str = "replace",
 ):
     trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
     if verbose:
@@ -227,16 +207,7 @@ def _decode_default(
 
 
 def _decode_chatml(
-    tokens: List[int],
-    *,
-    stop_words: List[str],
-    eod_token_ids: List[int],
-    tokenizer: PreTrainedTokenizer,
-    raw_text_len: int,
-    context_length: int,
-    verbose: bool = False,
-    return_end_reason: bool = False,
-    errors: str='replace'
+    tokens: List[int], *, stop_words: List[str], eod_token_ids: List[int], tokenizer: PreTrainedTokenizer, raw_text_len: int, context_length: int, verbose: bool = False, return_end_reason: bool = False, errors: str = "replace"
 ):
     end_reason = f"Gen length {len(tokens)}"
     eod_token_idx = context_length
@@ -270,7 +241,7 @@ def decode_tokens(
     chat_format: str,
     verbose: bool = False,
     return_end_reason: bool = False,
-    errors: str="replace",
+    errors: str = "replace",
 ) -> str:
     if torch.is_tensor(tokens):
         tokens = tokens.cpu().numpy().tolist()
@@ -315,42 +286,19 @@ class StopWordsLogitsProcessor(LogitsProcessor):
     """
 
     def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
-
         if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
-            raise ValueError(
-                f"`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}."
-            )
+            raise ValueError(f"`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}.")
         if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
-            raise ValueError(
-                f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
-            )
-        if any(
-            any(
-                (not isinstance(token_id, (int, np.integer)) or token_id < 0)
-                for token_id in stop_word_ids
-            )
-            for stop_word_ids in stop_words_ids
-        ):
-            raise ValueError(
-                f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
-            )
-
-        self.stop_words_ids = list(
-            filter(
-                lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
-            )
-        )
+            raise ValueError(f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}.")
+        if any(any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in stop_word_ids) for stop_word_ids in stop_words_ids):
+            raise ValueError(f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}.")
+
+        self.stop_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids))
         self.eos_token_id = eos_token_id
         for stop_token_seq in self.stop_words_ids:
-            assert (
-                len(stop_token_seq) > 0
-            ), "Stop words token sequences {} cannot have an empty list".format(
-                stop_words_ids
-            )
-
-    def __call__(
-        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
-    ) -> torch.FloatTensor:
+            assert len(stop_token_seq) > 0, "Stop words token sequences {} cannot have an empty list".format(stop_words_ids)
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         stopped_samples = self._calc_stopped_samples(input_ids)
         for i, should_stop in enumerate(stopped_samples):
             if should_stop:
@@ -416,4 +364,4 @@ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
 
 def switch(val1, val2, boolean):
     boolean = boolean.type_as(val1)
-    return (1 - boolean) * val1 + boolean * val2
\ No newline at end of file
+    return (1 - boolean) * val1 + boolean * val2
diff --git a/lmms_eval/models/otterhd.py b/lmms_eval/models/otterhd.py
index e9fa4635..70d6d620 100644
--- a/lmms_eval/models/otterhd.py
+++ b/lmms_eval/models/otterhd.py
@@ -172,7 +172,9 @@ def _collate(x):
                 gen_kwargs["top_p"] = None
             if "num_beams" not in gen_kwargs:
                 gen_kwargs["num_beams"] = 1
-            generation_output = self.model.generate(**model_inputs, max_new_tokens=gen_kwargs["max_new_tokens"], pad_token_id=self.tokenizer.eos_token_id)
+            generation_output = self.model.generate(
+                **model_inputs, temperature=gen_kwargs["temperature"], max_new_tokens=gen_kwargs["max_new_tokens"], top_p=gen_kwargs["top_p"], num_beams=gen_kwargs["num_beams"], pad_token_id=self.tokenizer.eos_token_id
+            )
             generation_texts = self.processor.batch_decode(generation_output, skip_special_tokens=True)
             response = [gen_text.split("\x04")[1].strip(" ").strip("\n") for gen_text in generation_texts]
             res.extend(response)
diff --git a/lmms_eval/models/qwen_vl.py b/lmms_eval/models/qwen_vl.py
index 01334c51..c3b6c7e1 100644
--- a/lmms_eval/models/qwen_vl.py
+++ b/lmms_eval/models/qwen_vl.py
@@ -133,46 +133,30 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
                 name = uuid.uuid4().hex.upper()[0:6]
                 visual.save(f"/tmp/{name}.png")
                 visual_paths.append(f"/tmp/{name}.png")
-                query.append({
-                'image' : f"/tmp/{name}.png"
-                })
-            
+                query.append({"image": f"/tmp/{name}.png"})
+
             # Make a copy for query to save context (text that needs to be masked)
-            context_query = [ _ for _ in query]
-            context_query.append({'text' : contexts})
-            query.append({'text' : contexts + continuation})
-            
+            context_query = [_ for _ in query]
+            context_query.append({"text": contexts})
+            query.append({"text": contexts + continuation})
+
             context_query = self.tokenizer.from_list_format(context_query)
             query = self.tokenizer.from_list_format(query)
-            
+
             raw_contxt_text, context_tokens = make_context(
-                    self.tokenizer,
-                    context_query,
-                    history=None,
-                    system='You are a helpful assistant',
-                    max_window_size=self.model.generation_config.max_window_size,
-                    chat_format=self.model.generation_config.chat_format
-                )
+                self.tokenizer, context_query, history=None, system="You are a helpful assistant", max_window_size=self.model.generation_config.max_window_size, chat_format=self.model.generation_config.chat_format
+            )
             context_tokens = torch.tensor([context_tokens])
-            
+
             raw_continuation_text, continuation_tokens = make_context(
-                    self.tokenizer,
-                    query,
-                    history=None,
-                    system='You are a helpful assistant',
-                    max_window_size=self.model.generation_config.max_window_size,
-                    chat_format=self.model.generation_config.chat_format
-                )
+                self.tokenizer, query, history=None, system="You are a helpful assistant", max_window_size=self.model.generation_config.max_window_size, chat_format=self.model.generation_config.chat_format
+            )
             continuation_tokens = torch.tensor([continuation_tokens]).to(self.model.device)
             attn_mask = torch.ones_like(continuation_tokens).to(self.model.device)
             labels = continuation_tokens.clone().to(self.model.device)
-            labels[:, :context_tokens.shape[1]] = -100
+            labels[:, : context_tokens.shape[1]] = -100
             with torch.inference_mode():
-                outputs = self.model(
-                    input_ids = continuation_tokens,
-                    labels=labels,
-                    attention_mask = attn_mask
-                )
+                outputs = self.model(input_ids=continuation_tokens, labels=labels, attention_mask=attn_mask)
             loss = outputs.loss
             logits = outputs["logits"]
             greedy_tokens = logits.argmax(dim=-1)
@@ -181,10 +165,9 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
             max_equal = (greedy_tokens == cont_toks).all()
             res.append((float(loss.item()), bool(max_equal)))
             pbar.update(1)
-            
+
         pbar.close()
         return res
-            
 
         assert False, "We have not implemented this function for Qwen VL yet"
 
diff --git a/lmms_eval/tasks/cmmmu/utils.py b/lmms_eval/tasks/cmmmu/utils.py
index e5899677..ca4e7b31 100644
--- a/lmms_eval/tasks/cmmmu/utils.py
+++ b/lmms_eval/tasks/cmmmu/utils.py
@@ -6,38 +6,39 @@
 from collections import Counter
 
 PROMPT = {
-    'task_instructions': [
-        '请回答以下多项选择题，并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案，那么请根据可用的数据和你的判断来选择最可能正确的选项。',
-        '请回答以下判断题，并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断，请运用你的逻辑推理和现有信息来做出最可能的判断。',
-        '请回答以下填空题，并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答，那么请依据现有的数据和你的推理能力来填写最合理的答案。'
+    "task_instructions": [
+        "请回答以下多项选择题，并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案，那么请根据可用的数据和你的判断来选择最可能正确的选项。",
+        "请回答以下判断题，并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断，请运用你的逻辑推理和现有信息来做出最可能的判断。",
+        "请回答以下填空题，并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答，那么请依据现有的数据和你的推理能力来填写最合理的答案。",
     ],
-    'multi_choice_example_format': ['问题：{}\n选项：\n{}\n正确答案：\n'],
-    'T/F_example_format': ['问题：{}\n正确答案：\n'],
-    'short_ans_example_format': ['问题：{}\n正确答案：\n'],
+    "multi_choice_example_format": ["问题：{}\n选项：\n{}\n正确答案：\n"],
+    "T/F_example_format": ["问题：{}\n正确答案：\n"],
+    "short_ans_example_format": ["问题：{}\n正确答案：\n"],
 }
 
+
 def construct_prompt(sample):
-    question = sample['question']
-    task_instructions = PROMPT['task_instructions']
-    
-    if sample['type'] == '选择':
+    question = sample["question"]
+    task_instructions = PROMPT["task_instructions"]
+
+    if sample["type"] == "选择":
         formatted_options = ""
-        start_chr = 'A'
+        start_chr = "A"
         for i in range(1, 5):
             formatted_options += f"({start_chr}) {sample[f'option{i}']}\n"
             start_chr = chr(ord(start_chr) + 1)
-        
-        current_example_template = PROMPT['multi_choice_example_format'][0]
+
+        current_example_template = PROMPT["multi_choice_example_format"][0]
         current_example = current_example_template.format(question, formatted_options)
         final_input_prompt = task_instructions[0] + "\n\n" + current_example
-    
-    elif sample['type'] == '判断':
-        current_example_template = PROMPT['T/F_example_format'][0]
+
+    elif sample["type"] == "判断":
+        current_example_template = PROMPT["T/F_example_format"][0]
         current_example = current_example_template.format(question)
         final_input_prompt = task_instructions[1] + "\n\n" + current_example
-        
+
     else:  # For fill in the blanks questions.
-        current_example_template = PROMPT['short_ans_example_format'][0]
+        current_example_template = PROMPT["short_ans_example_format"][0]
         current_example = current_example_template.format(question)
         final_input_prompt = task_instructions[2] + "\n\n" + current_example
 
@@ -46,9 +47,11 @@ def construct_prompt(sample):
 
     return final_input_prompt
 
+
 def cmmmu_doc_to_text(doc):
     return construct_prompt(doc)
 
+
 def cmmmu_doc_to_visual(doc):
     prompt = construct_prompt(doc)
     image_tokens = re.findall(r"<图片 \d+>", prompt)
@@ -57,6 +60,7 @@ def cmmmu_doc_to_visual(doc):
     visual = [doc[image_token].convert("RGB") for image_token in image_tokens]
     return visual
 
+
 def cmmmu_process_results(doc, results):
     pred = results[0]
     if doc["type"] == "选择":
@@ -68,6 +72,7 @@ def cmmmu_process_results(doc, results):
         parsed_pred = get_fill_blank_prediction(pred, doc["answer"])
     return {"cmmmu_acc": {"id": doc["id"], "subdomain": doc["subcategory"], "question_type": doc["type"], "answer": doc["answer"], "parsed_pred": parsed_pred}}
 
+
 def cmmmu_aggregate_results(results):
     evaluation_result = {}
     subset_to_eval_samples = defaultdict(list)
@@ -105,16 +110,18 @@ def cmmmu_aggregate_results(results):
     print(printable_results)
     return printable_results["Overall"]["acc"]
 
+
 def cmmmu_process_test_results_for_submission(doc, results):
     response = results[0]
     return {"cmmmu_acc": {"id": doc["id"], "type": doc["type"], "response": response}}
 
+
 def cmmmu_test_aggregate_results_for_submission(results):
     os.makedirs("./submissions", exist_ok=True)
     with open("./submissions/cmmmu_test_for_submission.jsonl", "w") as f:
         for result in results:
             json.dump(result, f, ensure_ascii=False)
-            f.write('\n')
+            f.write("\n")
     return -1
 
 
@@ -128,26 +135,26 @@ def cmmmu_test_aggregate_results_for_submission(results):
     "科学": ["生物", "化学", "地理", "数学", "物理"],
     "健康与医学": ["基础医学", "临床医学", "诊断学与实验室医学", "制药", "公共卫生"],
     "人文社会科学": ["历史", "文献学", "社会学", "心理学"],
-    "技术与工程": ["农业", "建筑学", "计算机科学", "电子学", "能源和电力", "材料", "机械工程"]
+    "技术与工程": ["农业", "建筑学", "计算机科学", "电子学", "能源和电力", "材料", "机械工程"],
 }
 
-def eval_cmmmu(entries):
 
+def eval_cmmmu(entries):
     correct_cnt = 0
     for entry in entries:
-        parsed_pred = entry.get('parsed_pred', '')
+        parsed_pred = entry.get("parsed_pred", "")
         correct = False
-        if entry.get('question_type') == '选择':
-            if parsed_pred == entry['answer']:
+        if entry.get("question_type") == "选择":
+            if parsed_pred == entry["answer"]:
                 correct_cnt += 1
                 correct = True
 
-        elif entry.get('question_type') == '填空':
-            norm_answers = normalize_str(entry['answer'], entry['answer'])
+        elif entry.get("question_type") == "填空":
+            norm_answers = normalize_str(entry["answer"], entry["answer"])
 
             for pred in parsed_pred:
                 # already normalized
-                if isinstance(pred, str): # if it's a string, then find if ans in the pred_i
+                if isinstance(pred, str):  # if it's a string, then find if ans in the pred_i
                     for norm_ans in norm_answers:
                         # only see if the string answer in the string pred
                         # print(norm_ans, pred)
@@ -156,7 +163,7 @@ def eval_cmmmu(entries):
                                 correct_cnt += 1
                                 correct = True
                             break
-                else: # it's a number
+                else:  # it's a number
                     if pred in norm_answers:
                         if not correct:
                             correct_cnt += 1
@@ -164,9 +171,10 @@ def eval_cmmmu(entries):
                         break
 
         else:
-            positive_keywords = ['正确', '对', '准确', '肯定', '对的']
-            negative_keywords = ['不对', '错误', '不正确', '不准确', '不合适', '否定', '错的', '错']
-            ambiguous_keywords = ['对错', '是否正确', '否正确', '或者', '是否', '正确性', '对不']
+            positive_keywords = ["正确", "对", "准确", "肯定", "对的"]
+            negative_keywords = ["不对", "错误", "不正确", "不准确", "不合适", "否定", "错的", "错"]
+            ambiguous_keywords = ["对错", "是否正确", "否正确", "或者", "是否", "正确性", "对不"]
+
             def judge_similarity(pred_list, positive_keywords, negative_keywords):
                 positive_count = 0
                 negative_count = 0
@@ -182,50 +190,43 @@ def judge_similarity(pred_list, positive_keywords, negative_keywords):
                 elif negative_count > positive_count:
                     return "错"
                 else:
-                    return random.choice(['对', '错'])
-            answer = entry['answer']
+                    return random.choice(["对", "错"])
+
+            answer = entry["answer"]
             parsed_pred = [word for word in parsed_pred if not any(ambiguous in word for ambiguous in ambiguous_keywords)]
             result = judge_similarity(parsed_pred, positive_keywords, negative_keywords)
             if result == answer:
                 correct_cnt += 1
                 correct = True
         if correct:
-            entry['judge'] = '正确'
+            entry["judge"] = "正确"
         else:
-            entry['judge'] = '错误'
+            entry["judge"] = "错误"
 
     if len(entries) == 0:
-        print('entries_num == 0, please check your file')
-        results_count = {
-        'correct_num': 0,
-        'entries_num': 0,
-        'acc': 0
-        }
+        print("entries_num == 0, please check your file")
+        results_count = {"correct_num": 0, "entries_num": 0, "acc": 0}
     else:
-         results_count = {
-        'correct_num': correct_cnt,
-        'entries_num': len(entries),
-        'acc': correct_cnt / len(entries)
-        }
+        results_count = {"correct_num": correct_cnt, "entries_num": len(entries), "acc": correct_cnt / len(entries)}
 
     return results_count
 
 
 def get_multi_choice_prediction(response, all_choices, index2ans):
-    for char in [',', '.', '!', '?', ';', ':',"'"]:
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
         response = response.strip(char)
-    response = " " + response + " " # add space to avoid partial match
+    response = " " + response + " "  # add space to avoid partial match
 
     candidates = []
 
     for choice in all_choices:  # (A) (B) (C) (D)
         # Add the choice to candidates each time it appears in the response
-        candidates.extend([choice for _ in range(response.count(f'({choice})'))])
+        candidates.extend([choice for _ in range(response.count(f"({choice})"))])
 
     if len(candidates) == 0:
         for choice in all_choices:  # A B C D
             # Similarly, add the choice for each occurrence
-            candidates.extend([choice for _ in range(response.count(f'{choice}'))])
+            candidates.extend([choice for _ in range(response.count(f"{choice}"))])
 
     if len(candidates) == 0 and len(response.split()) >= 1:
         for index, ans in index2ans.items():
@@ -237,7 +238,7 @@ def get_multi_choice_prediction(response, all_choices, index2ans):
         for index, ans in index2ans.items():
             if ans in response:
                 candidates.append(index)
-                index_ans = False # it's content ans.
+                index_ans = False  # it's content ans.
 
     if len(candidates) == 0:  # still not get answer, randomly choose one.
         return random.choice(all_choices)
@@ -251,15 +252,16 @@ def get_multi_choice_prediction(response, all_choices, index2ans):
         most_frequent_candidates = [c for c in all_choices if candidate_counts.get(c, 0) == max_count]
 
         # Combine the most frequent candidates in ABCD order
-        return ''.join(most_frequent_candidates)
+        return "".join(most_frequent_candidates)
+
 
 def extract_numbers(string):
     # Pattern for numbers with Chinese commas
-    pattern_commas = r'-?\d{1,3}(?:，\d{3})+'
+    pattern_commas = r"-?\d{1,3}(?:，\d{3})+"
     # Pattern for scientific notation
-    pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
+    pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
     # Pattern for simple numbers without Chinese commas
-    pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!，\d)'
+    pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!，\d)"
 
     # Extract numbers with Chinese commas
     numbers_with_commas = re.findall(pattern_commas, string)
@@ -272,16 +274,19 @@ def extract_numbers(string):
     all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
     return all_numbers
 
+
 def check_is_number(string):
     try:
-        float(string.replace(',', ''))
+        float(string.replace(",", ""))
         return True
     except ValueError:
         # check if there's comma inside
         return False
 
+
 def count_letters(string):
-    return sum(c.isalpha() and 'a' <= c <= 'z' or 'A' <= c <= 'Z' for c in string)
+    return sum(c.isalpha() and "a" <= c <= "z" or "A" <= c <= "Z" for c in string)
+
 
 def normalize_str(string, answer):
     # check if characters in the string
@@ -294,16 +299,17 @@ def normalize_str(string, answer):
     is_number = check_is_number(string)
 
     if is_number:
-        string = string.replace(',', '')
+        string = string.replace(",", "")
         string = float(string)
         # leave 2 decimal
         string = round(string, 2)
         return [string]
-    else: # it's likely to be a string
+    else:  # it's likely to be a string
         if len(string) > len(answer) + 20 or count_letters(string) > count_letters(answer) + 2:
             return []
         return [string]
 
+
 def get_fill_blank_prediction(response, answer):
     """get the prediction from the generated response,
     return a list of predicted strings or numbers"""
@@ -311,15 +317,14 @@ def get_fill_blank_prediction(response, answer):
     def get_key_subresponses(response):
         key_responses = []
         response = response.strip("。").strip()
-        sub_responses = re.split(r'。|\n', response)
-        indicators_of_keys = ['是', '为', '所以', '等于', '方案', '选择',
-                            '正确答案', '因此', '最后', '答案', '结果']
+        sub_responses = re.split(r"。|\n", response)
+        indicators_of_keys = ["是", "为", "所以", "等于", "方案", "选择", "正确答案", "因此", "最后", "答案", "结果"]
         key_responses = []
         for index, resp in enumerate(sub_responses):
             # if last one, accept it's an equation (the entire response can be just one sentence with equation)
             if index == len(sub_responses) - 1:
-                indicators_of_keys.extend(['='])
-            shortest_key_response = None # the shortest response that may contain the answer (tail part of the response)
+                indicators_of_keys.extend(["="])
+            shortest_key_response = None  # the shortest response that may contain the answer (tail part of the response)
             for indicator in indicators_of_keys:
                 if indicator in resp:
                     if not shortest_key_response:
@@ -327,18 +332,18 @@ def get_key_subresponses(response):
                     else:
                         if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
                             shortest_key_response = resp.split(indicator)[-1].strip()
-            
+
             if shortest_key_response:
                 # and it's not trivial
                 if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
                     key_responses.append(shortest_key_response)
-        if len(key_responses) == 0: # did not found any
+        if len(key_responses) == 0:  # did not found any
             return [response]
         return key_responses
 
     key_responses = get_key_subresponses(response)
 
-    pred_list = key_responses.copy() # keep the original string response
+    pred_list = key_responses.copy()  # keep the original string response
     for resp in key_responses:
         pred_list.extend(extract_numbers(resp))
 
@@ -352,6 +357,7 @@ def get_key_subresponses(response):
 
     return pred_list
 
+
 def get_TF_prediction(response):
     """get the prediction from the generated response,
     return a list of predicted strings or numbers"""
@@ -359,12 +365,11 @@ def get_TF_prediction(response):
     def get_key_subresponses(response):
         key_responses = []
         response = response.strip("。").strip()
-        sub_responses = re.split(r'。|\n', response)
-        indicators_of_keys = ['是', '为', '所以', '判断',
-                            '陈述', '说法', '表达', '答案', '结果']
+        sub_responses = re.split(r"。|\n", response)
+        indicators_of_keys = ["是", "为", "所以", "判断", "陈述", "说法", "表达", "答案", "结果"]
         key_responses = []
         for index, resp in enumerate(sub_responses):
-            shortest_key_response = None # the shortest response that may contain the answer (tail part of the response)
+            shortest_key_response = None  # the shortest response that may contain the answer (tail part of the response)
             for indicator in indicators_of_keys:
                 if indicator in resp:
                     if not shortest_key_response:
@@ -372,25 +377,25 @@ def get_key_subresponses(response):
                     else:
                         if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
                             shortest_key_response = resp.split(indicator)[-1].strip()
-            
+
             if shortest_key_response:
                 # and it's not trivial
                 if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
                     key_responses.append(shortest_key_response)
-        if len(key_responses) == 0: # did not found any
+        if len(key_responses) == 0:  # did not found any
             return [response]
         return key_responses
 
     key_responses = get_key_subresponses(response)
 
-    pred_list = key_responses.copy() # keep the original string response
+    pred_list = key_responses.copy()  # keep the original string response
     # remove duplicates
     pred_list = list(set(pred_list))
 
     return pred_list
 
-def get_multi_choice_info(options):
 
+def get_multi_choice_info(options):
     start_chr = "A"
     all_choices = []
     index2ans = {}
@@ -400,8 +405,8 @@ def get_multi_choice_info(options):
 
     return index2ans, all_choices
 
-def calculate_ins_level_acc(results):
 
+def calculate_ins_level_acc(results):
     correct_sum = 0
     entries_sum = 0
     for cat_results in results.values():
@@ -409,4 +414,4 @@ def calculate_ins_level_acc(results):
         entries_sum += cat_results["entries_num"]
     if entries_sum == 0:
         return 0
-    return correct_sum / entries_sum
\ No newline at end of file
+    return correct_sum / entries_sum
diff --git a/lmms_eval/tasks/dc100_en/utils.py b/lmms_eval/tasks/dc100_en/utils.py
index 318f4c21..d3b47ab7 100644
--- a/lmms_eval/tasks/dc100_en/utils.py
+++ b/lmms_eval/tasks/dc100_en/utils.py
@@ -2,6 +2,7 @@
 import requests
 import re
 import logging
+import time
 import os
 import yaml
 from pathlib import Path
@@ -36,7 +37,7 @@ def doc_to_visual(doc):
 Provide a few lines for explanation and the rate number at last after "Final Score:"."""
 
 
-def get_chat_response(base64_image, prompt, max_retries=3):
+def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
     headers = {
         "Authorization": f"Bearer {API_KEY}",
         "Content-Type": "application/json",
@@ -68,8 +69,13 @@ def get_chat_response(base64_image, prompt, max_retries=3):
             return response_data["choices"][0]["message"]["content"]
         except requests.exceptions.RequestException as e:
             eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}")
+            time.sleep(wait_time)
             if attempt == max_retries - 1:
-                raise
+                eval_logger.error(f"Failed to get response after {max_retries} attempts")
+                return ""
+        except Exception as e:
+            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
+            return ""
 
 
 def image_to_base64(pil_image):
diff --git a/lmms_eval/tasks/dc200_cn/utils.py b/lmms_eval/tasks/dc200_cn/utils.py
index bd00f4b5..1e729150 100644
--- a/lmms_eval/tasks/dc200_cn/utils.py
+++ b/lmms_eval/tasks/dc200_cn/utils.py
@@ -6,6 +6,7 @@
 import yaml
 from pathlib import Path
 from io import BytesIO
+import time
 
 
 def doc_to_visual(doc):
@@ -36,7 +37,7 @@ def doc_to_visual(doc):
 Provide a few lines for explanation and the rate number at last after "Final Score:"."""
 
 
-def get_chat_response(base64_image, prompt, max_retries=3):
+def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
     headers = {
         "Authorization": f"Bearer {API_KEY}",
         "Content-Type": "application/json",
@@ -68,8 +69,13 @@ def get_chat_response(base64_image, prompt, max_retries=3):
             return response_data["choices"][0]["message"]["content"]
         except requests.exceptions.RequestException as e:
             eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}")
+            time.sleep(wait_time)
             if attempt == max_retries - 1:
-                raise
+                eval_logger.error(f"Failed to get response after {max_retries} attempts")
+                return ""
+        except Exception as e:
+            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
+            return ""
 
 
 def image_to_base64(pil_image):
diff --git a/lmms_eval/tasks/iconqa/iconqa_test.yaml b/lmms_eval/tasks/iconqa/iconqa_test.yaml
new file mode 100644
index 00000000..58aa9e70
--- /dev/null
+++ b/lmms_eval/tasks/iconqa/iconqa_test.yaml
@@ -0,0 +1,23 @@
+dataset_path: lmms-lab/ICON-QA
+dataset_kwargs:
+  token: True
+task: "iconqa_test"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.doc_to_visual
+doc_to_text: !function utils.doc_to_text
+doc_to_target: "answers"
+generation_kwargs:
+  max_new_tokens: 32
+  temperature: 0
+  do_sample: False
+metric_list:
+  - metric: anls
+    aggregation: mean
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    statement: "Given a set of images and a question, please provide the answer to the question.\n"
+    options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with the option letter from the given choices directly."
+    freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase."
\ No newline at end of file
diff --git a/lmms_eval/tasks/iconqa/iconqa_val.yaml b/lmms_eval/tasks/iconqa/iconqa_val.yaml
new file mode 100644
index 00000000..7215715e
--- /dev/null
+++ b/lmms_eval/tasks/iconqa/iconqa_val.yaml
@@ -0,0 +1,23 @@
+dataset_path: lmms-lab/ICON-QA
+dataset_kwargs:
+  token: True
+task: "iconqa_val"
+test_split: val
+output_type: generate_until
+doc_to_visual: !function utils.doc_to_visual
+doc_to_text: !function utils.doc_to_text
+doc_to_target: "answers"
+generation_kwargs:
+  max_new_tokens: 32
+  temperature: 0
+  do_sample: False
+metric_list:
+  - metric: anls
+    aggregation: mean
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    statement: "Given a set of images and a question, please provide the answer to the question.\n"
+    options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with the option letter from the given choices directly."
+    freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase."
\ No newline at end of file
diff --git a/lmms_eval/tasks/iconqa/utils.py b/lmms_eval/tasks/iconqa/utils.py
new file mode 100644
index 00000000..5d833fec
--- /dev/null
+++ b/lmms_eval/tasks/iconqa/utils.py
@@ -0,0 +1,62 @@
+import json
+import os
+
+
+def options_to_str(options_prompt):
+    option_prompt_str = ""
+    for i, option in enumerate(options_prompt):
+        option_choice = chr(ord("A") + i)
+        option_prompt_str += f"{option_choice}. {option}\n"
+
+    option_prompt_str = option_prompt_str.rstrip("\n")
+    return option_prompt_str
+
+
+def doc_to_visual(doc):
+    image_list = []
+    if "query_image" in doc:
+        image_list.append(doc["query_image"].convert("RGB"))
+    if "choice_image_0" in doc:
+        image_list.append(doc["choice_image_0"].convert("RGB"))
+    if "choice_image_1" in doc:
+        image_list.append(doc["choice_image_1"].convert("RGB"))
+    if "choice_image_2" in doc:
+        image_list.append(doc["choice_image_2"].convert("RGB"))
+    if "choice_image_3" in doc:
+        image_list.append(doc["choice_image_3"].convert("RGB"))
+    if "choice_image_4" in doc:
+        image_list.append(doc["choice_image_4"].convert("RGB"))
+    assert len(image_list) < 6, "Maximum 5 images allowed for ICON-QA"
+    return image_list
+
+
+def doc_to_text(doc, model_specific_prompt_kwargs):
+    question = doc["question"]
+    ques_type = doc["ques_type"]
+    options_prompt = []
+
+    if ques_type == "choose_img":
+        options_prompt.append("The first image.")
+        options_prompt.append("The second image.")
+
+        options_str = options_to_str(options_prompt)
+        full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}"
+
+    elif ques_type == "choose_txt":
+        choices = doc["choices"].split(",")
+        for i, choice in enumerate(choices):
+            options_prompt.append(f"{choice}")
+
+        options_str = options_to_str(options_prompt)
+        full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}"
+
+    elif ques_type == "fill_in_blank":
+        full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['freeform_statement'].format(question=question)}"
+
+    return full_prompt
+
+
+def test_process_results(doc, results):
+    pred = results[0]
+    questionId = doc["questionId"]
+    return {"anls": {"questionId": int(questionId), "answer": pred}}
diff --git a/lmms_eval/tasks/multidocvqa/utils.py b/lmms_eval/tasks/multidocvqa/utils.py
index 33241cf3..698e25cc 100644
--- a/lmms_eval/tasks/multidocvqa/utils.py
+++ b/lmms_eval/tasks/multidocvqa/utils.py
@@ -12,36 +12,41 @@ def multidocvqa_doc_to_text(doc, model_specific_prompt_kwargs):
 
     return f"{pre_prompt}{question}{post_prompt}"
 
+
 def multidocvqa_doc_to_visual(doc):
-    return [doc[f'image_{i}'].convert("RGB") for i in range(1, 21) if doc[f'image_{i}'] is not None]
+    return [doc[f"image_{i}"].convert("RGB") for i in range(1, 21) if doc[f"image_{i}"] is not None]
+
 
 def multidocvqa_process_results(doc, results):
     pred_answer = results[0]
-    answer = ast.literal_eval(doc['answers'])
+    answer = ast.literal_eval(doc["answers"])
+
+    return {"anls": {"questionId": int(doc["questionId"]), "answer": answer, "pred_answer": pred_answer}, "accuracy": {"questionId": int(doc["questionId"]), "answer": answer, "pred_answer": pred_answer}}
 
-    return {"anls": {"questionId": int(doc["questionId"]), "answer": answer, "pred_answer": pred_answer},
-            "accuracy": {"questionId": int(doc["questionId"]), "answer": answer, "pred_answer": pred_answer}}
 
 def multidocvqa_aggregate_results_anls(results):
     keys = {k for result in results for k in result}
     results = {key: [result.get(key, None) for result in results] for key in keys}
     evaluator = Evaluator(case_sensitive=False)
-    metric = evaluator.get_metrics(results['answer'], results['pred_answer'])
+    metric = evaluator.get_metrics(results["answer"], results["pred_answer"])
+
+    return sum(metric["anls"]) / len(metric["anls"])
 
-    return sum(metric['anls']) / len(metric['anls'])
 
 def multidocvqa_aggregate_results_accuracy(results):
     keys = {k for result in results for k in result}
     results = {key: [result.get(key, None) for result in results] for key in keys}
     evaluator = Evaluator(case_sensitive=False)
-    metric = evaluator.get_metrics(results['answer'], results['pred_answer'])
+    metric = evaluator.get_metrics(results["answer"], results["pred_answer"])
+
+    return sum(metric["accuracy"]) / len(metric["accuracy"])
 
-    return sum(metric['accuracy']) / len(metric['accuracy'])
 
 def multidocvqa_process_test_results_for_submission(doc, results):
     answer = results[0]
     return {"submission": {"questionId": int(doc["questionId"]), "answer": answer, "answer_page": None}}
 
+
 def multidocvqa_test_aggregate_results_for_submission(results):
     os.makedirs("./submissions", exist_ok=True)
     with open("./submissions/multidocvqa_test_for_submission.json", "w") as f:
@@ -56,7 +61,6 @@ def multidocvqa_test_aggregate_results_for_submission(results):
 
 class Evaluator:
     def __init__(self, case_sensitive=False):
-
         self.case_sensitive = case_sensitive
         self.get_edit_distance = levenshtein_distance
         self.anls_threshold = 0.5
@@ -71,7 +75,7 @@ def get_metrics(self, gt_answers, preds):
             batch_accuracy.append(self._calculate_accuracy(gt, pred))
             batch_anls.append(self._calculate_anls(gt, pred))
 
-        return {'accuracy': batch_accuracy, 'anls': batch_anls}
+        return {"accuracy": batch_accuracy, "anls": batch_anls}
 
     def _preprocess_str(self, string):
         if not self.case_sensitive:
@@ -80,8 +84,7 @@ def _preprocess_str(self, string):
         return string.strip()
 
     def _calculate_accuracy(self, gt, pred):
-
-        if pred == 'none':
+        if pred == "none":
             return 0
 
         for gt_elm in gt:
@@ -94,7 +97,7 @@ def _calculate_anls(self, gt, pred):
         if len(pred) == 0:
             return 0
 
-        if pred == 'none':
+        if pred == "none":
             return 0
 
         answers_similarity = [1 - self.get_edit_distance(gt_elm, pred) / max(len(gt_elm), len(pred)) for gt_elm in gt]
@@ -102,7 +105,8 @@ def _calculate_anls(self, gt, pred):
 
         anls = max_similarity if max_similarity >= self.anls_threshold else 0
         return anls
-    
-if __name__ == '__main__':
-    print('-----------------')
-    multidocvqa_aggregate_results_anls([{"questionId": 1, "answer": ["answer"], "pred_answer": "pred_answer"}, {"questionId": 2, "answer": ["nswer"], "pred_answer": "nswer"}])
\ No newline at end of file
+
+
+if __name__ == "__main__":
+    print("-----------------")
+    multidocvqa_aggregate_results_anls([{"questionId": 1, "answer": ["answer"], "pred_answer": "pred_answer"}, {"questionId": 2, "answer": ["nswer"], "pred_answer": "nswer"}])
diff --git a/lmms_eval/tasks/seedbench_2/utils.py b/lmms_eval/tasks/seedbench_2/utils.py
index 5ec7fccc..abcc97b8 100644
--- a/lmms_eval/tasks/seedbench_2/utils.py
+++ b/lmms_eval/tasks/seedbench_2/utils.py
@@ -4,21 +4,23 @@
 def seed_doc_to_visual(doc):
     return [image.convert("RGB") for image in doc["image"]]
 
-def parse_choice_img(choice : str, img_token : str):
+
+def parse_choice_img(choice: str, img_token: str):
     if "jpg" in choice or "png" in choice:
         return img_token
     return choice
 
-def seed_doc_to_text(doc, model_specific_kwargs = None):
+
+def seed_doc_to_text(doc, model_specific_kwargs=None):
     question = doc["question"]
-    question.replace("<img>", model_specific_kwargs['img_token'])
+    question.replace("<img>", model_specific_kwargs["img_token"])
     question += "\n" + f"A. {parse_choice_img(doc['choice_a'], model_specific_kwargs['img_token'])}\n"
     question += f"B. {parse_choice_img(doc['choice_b'], model_specific_kwargs['img_token'])}\n"
     question += f"C. {parse_choice_img(doc['choice_c'], model_specific_kwargs['img_token'])}\n"
     question += f"D. {parse_choice_img(doc['choice_d'], model_specific_kwargs['img_token'])}"
-    if (doc['data_type'] == "Image Generation"):
-        num_img_in_question = len(doc['image']) - 4
-        prepend_tokens = [model_specific_kwargs['img_token']] * num_img_in_question
+    if doc["data_type"] == "Image Generation":
+        num_img_in_question = len(doc["image"]) - 4
+        prepend_tokens = [model_specific_kwargs["img_token"]] * num_img_in_question
         question = " ".join(prepend_tokens) + "\n" + question
     return f"{question}\n{model_specific_kwargs['post_prompt']}"
 
diff --git a/lmms_eval/tasks/stvqa/utils.py b/lmms_eval/tasks/stvqa/utils.py
index 1f106f06..bb317f69 100644
--- a/lmms_eval/tasks/stvqa/utils.py
+++ b/lmms_eval/tasks/stvqa/utils.py
@@ -1,21 +1,25 @@
 import os
 import json
 
+
 def stvqa_doc_to_text(doc, model_specific_prompt_kwargs):
     question = doc["question"]
     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
     post_prompt = model_specific_prompt_kwargs["post_prompt"]
     return f"{pre_prompt}{question}{post_prompt}"
 
+
 def stvqa_doc_to_visual(doc):
     return [doc["image"].convert("RGB")]
 
+
 def stvqa_process_results(doc, results):
     answer = results[0]
     return {"submission": {"question_id": int(doc["question_id"]), "answer": answer}}
 
+
 def stvqa_aggregate_submissions(results):
     os.makedirs("./submissions", exist_ok=True)
     with open("./submissions/stvqa_test_for_submission.json", "w") as f:
         json.dump(results, f)
-    return -1
\ No newline at end of file
+    return -1