From dfbddf814732276b49e427350fd526d507ef23ad Mon Sep 17 00:00:00 2001
From: Kaichen Zhang - NTU
Date: Wed, 25 Sep 2024 15:52:16 +0800
Subject: [PATCH] [Fix] Model name None in Task manager, mix eval model specific kwargs, claude retrying fix (#278)

* Fix task manager model name None issue
* Change model specific args in mix eval to lmms eval kwargs
* json dump indent 4
* lint
* Fix claude always retrying 5 time error
---
 lmms_eval/__main__.py              |  4 +--
 lmms_eval/models/claude.py         |  7 ++++-
 lmms_eval/tasks/mix_evals/utils.py | 42 +++++++++++++++---------------
 3 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index a50e1ce9..ec2d7858 100755
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -365,7 +365,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
 
-    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path, model_name=args.model)
 
     # update the evaluation tracker args with the output path and the HF token
     if args.output_path:
@@ -392,8 +392,6 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
 
-    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
-
     if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
         eval_logger.warning("Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub.")
 
diff --git a/lmms_eval/models/claude.py b/lmms_eval/models/claude.py
index edb72b0f..823823dc 100644
--- a/lmms_eval/models/claude.py
+++ b/lmms_eval/models/claude.py
@@ -232,8 +232,10 @@ def generate_until(self, requests) -> List[str]:
                 gen_kwargs["num_beams"] = 1
 
             for attempt in range(5):
+                retry_flag = True
                 try:
                     message = client.messages.create(model=self.model_version, max_tokens=gen_kwargs["max_new_tokens"], system=self.system_prompt, temperature=gen_kwargs["temperature"], top_p=gen_kwargs["top_p"], messages=messages)
+                    retry_flag = False
                 except Exception as e:
                     eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                     if attempt < 5 - 1:  # If we have retries left, sleep and then continue to next attempt
@@ -243,6 +245,9 @@ def generate_until(self, requests) -> List[str]:
                         res.append("")
                         pbar.update(1)
                         continue
+                if not retry_flag:
+                    break
+                eval_logger.info("Retrying...")
 
             response_text = message.content[0].text
             res.append(message.content[0].text)
@@ -254,7 +259,7 @@ def generate_until(self, requests) -> List[str]:
             doc_uuid = f"{task}___{split}___{doc_id}"
             self.response_cache[doc_uuid] = response_text
             with open(self.response_persistent_file, "w") as f:
-                json.dump(self.response_cache, f)
+                json.dump(self.response_cache, f, indent=4)
 
         pbar.close()
 
diff --git a/lmms_eval/tasks/mix_evals/utils.py b/lmms_eval/tasks/mix_evals/utils.py
index 71a7ce3f..cd1d8e5b 100644
--- a/lmms_eval/tasks/mix_evals/utils.py
+++ b/lmms_eval/tasks/mix_evals/utils.py
@@ -130,15 +130,15 @@ def mix_evals_video2text_doc_to_visual(doc):
 
 
 # This is the place where you format your question
-def mix_evals_video2text_doc_to_text(doc, model_specific_prompt_kwargs=None):
-    if model_specific_prompt_kwargs is None:
-        model_specific_prompt_kwargs = {}
+def mix_evals_video2text_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
     pre_prompt = ""
     post_prompt = ""
-    if "pre_prompt" in model_specific_prompt_kwargs:
-        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
-    if "post_prompt" in model_specific_prompt_kwargs:
-        post_prompt = model_specific_prompt_kwargs["post_prompt"]
+    if "pre_prompt" in lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
+    if "post_prompt" in lmms_eval_specific_kwargs:
+        post_prompt = lmms_eval_specific_kwargs["post_prompt"]
 
     user_prompt = doc["prompt"]
 
@@ -166,15 +166,15 @@ def mix_evals_video2text_doc_to_text(doc, model_specific_prompt_kwargs=None):
 """
 
 
-def mix_evals_video2text_doc_to_text_open_convs(doc, model_specific_prompt_kwargs=None):
-    if model_specific_prompt_kwargs is None:
-        model_specific_prompt_kwargs = {}
+def mix_evals_video2text_doc_to_text_open_convs(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
     pre_prompt = ""
     post_prompt = ""
-    if "pre_prompt" in model_specific_prompt_kwargs:
-        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
-    if "post_prompt" in model_specific_prompt_kwargs:
-        post_prompt = model_specific_prompt_kwargs["post_prompt"]
+    if "pre_prompt" in lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
+    if "post_prompt" in lmms_eval_specific_kwargs:
+        post_prompt = lmms_eval_specific_kwargs["post_prompt"]
 
     filtered_first_turn = re.sub(r"", "", doc["first_turn_user_prompt"])
     return OPEN_CONVS_PROMPT.format(
@@ -192,15 +192,15 @@ def mix_evals_video2text_doc_to_text_open_convs(doc, model_specific_prompt_kwarg
 """
 
 
-def mix_evals_video2text_doc_to_text_open_2nd_convs(doc, model_specific_prompt_kwargs=None):
-    if model_specific_prompt_kwargs is None:
-        model_specific_prompt_kwargs = {}
+def mix_evals_video2text_doc_to_text_open_2nd_convs(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
     pre_prompt = ""
     post_prompt = ""
-    if "pre_prompt" in model_specific_prompt_kwargs:
-        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
-    if "post_prompt" in model_specific_prompt_kwargs:
-        post_prompt = model_specific_prompt_kwargs["post_prompt"]
+    if "pre_prompt" in lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
+    if "post_prompt" in lmms_eval_specific_kwargs:
+        post_prompt = lmms_eval_specific_kwargs["post_prompt"]
 
     return MODEL_CONVS_PROMPT.format(
         PRE=pre_prompt,