From dfbddf814732276b49e427350fd526d507ef23ad Mon Sep 17 00:00:00 2001
From: Kaichen Zhang - NTU
Date: Wed, 25 Sep 2024 15:52:16 +0800
Subject: [PATCH] [Fix] Model name None in Task manager, mix eval model specific kwargs, claude retrying fix (#278)

* Fix task manager model name None issue
* Change model specific args in mix eval to lmms eval kwargs
* json dump indent 4
* lint
* Fix claude always retrying 5 time error
---
 lmms_eval/__main__.py              |  4 +--
 lmms_eval/models/claude.py         |  7 ++++-
 lmms_eval/tasks/mix_evals/utils.py | 42 +++++++++++++++---------------
 3 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index a50e1ce9..ec2d7858 100755
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -365,7 +365,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
 
-    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path, model_name=args.model)
 
     # update the evaluation tracker args with the output path and the HF token
     if args.output_path:
@@ -392,8 +392,6 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
 
-    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
-
     if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
         eval_logger.warning("Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub.")
 
diff --git a/lmms_eval/models/claude.py b/lmms_eval/models/claude.py
index edb72b0f..823823dc 100644
--- a/lmms_eval/models/claude.py
+++ b/lmms_eval/models/claude.py
@@ -232,8 +232,10 @@ def generate_until(self, requests) -> List[str]:
                 gen_kwargs["num_beams"] = 1
 
             for attempt in range(5):
+                retry_flag = True
                 try:
                     message = client.messages.create(model=self.model_version, max_tokens=gen_kwargs["max_new_tokens"], system=self.system_prompt, temperature=gen_kwargs["temperature"], top_p=gen_kwargs["top_p"], messages=messages)
+                    retry_flag = False
                 except Exception as e:
                     eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                     if attempt < 5 - 1:  # If we have retries left, sleep and then continue to next attempt
@@ -243,6 +245,9 @@ def generate_until(self, requests) -> List[str]:
                         res.append("")
                         pbar.update(1)
                         continue
+                if not retry_flag:
+                    break
+                eval_logger.info("Retrying...")
 
             response_text = message.content[0].text
             res.append(message.content[0].text)
@@ -254,7 +259,7 @@ def generate_until(self, requests) -> List[str]:
             doc_uuid = f"{task}___{split}___{doc_id}"
             self.response_cache[doc_uuid] = response_text
             with open(self.response_persistent_file, "w") as f:
-                json.dump(self.response_cache, f)
+                json.dump(self.response_cache, f, indent=4)
 
         pbar.close()
 
diff --git a/lmms_eval/tasks/mix_evals/utils.py b/lmms_eval/tasks/mix_evals/utils.py
index 71a7ce3f..cd1d8e5b 100644
--- a/lmms_eval/tasks/mix_evals/utils.py
+++ b/lmms_eval/tasks/mix_evals/utils.py
@@ -130,15 +130,15 @@ def mix_evals_video2text_doc_to_visual(doc):
 
 
 # This is the place where you format your question
-def mix_evals_video2text_doc_to_text(doc, model_specific_prompt_kwargs=None):
-    if model_specific_prompt_kwargs is None:
-        model_specific_prompt_kwargs = {}
+def mix_evals_video2text_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
     pre_prompt = ""
     post_prompt = ""
-    if "pre_prompt" in model_specific_prompt_kwargs:
-        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
-    if "post_prompt" in model_specific_prompt_kwargs:
-        post_prompt = model_specific_prompt_kwargs["post_prompt"]
+    if "pre_prompt" in lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
+    if "post_prompt" in lmms_eval_specific_kwargs:
+        post_prompt = lmms_eval_specific_kwargs["post_prompt"]
 
     user_prompt = doc["prompt"]
 
@@ -166,15 +166,15 @@ def mix_evals_video2text_doc_to_text(doc, model_specific_prompt_kwargs=None):
 """
 
 
-def mix_evals_video2text_doc_to_text_open_convs(doc, model_specific_prompt_kwargs=None):
-    if model_specific_prompt_kwargs is None:
-        model_specific_prompt_kwargs = {}
+def mix_evals_video2text_doc_to_text_open_convs(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
     pre_prompt = ""
     post_prompt = ""
-    if "pre_prompt" in model_specific_prompt_kwargs:
-        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
-    if "post_prompt" in model_specific_prompt_kwargs:
-        post_prompt = model_specific_prompt_kwargs["post_prompt"]
+    if "pre_prompt" in lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
+    if "post_prompt" in lmms_eval_specific_kwargs:
+        post_prompt = lmms_eval_specific_kwargs["post_prompt"]
 
     filtered_first_turn = re.sub(r"", "", doc["first_turn_user_prompt"])
     return OPEN_CONVS_PROMPT.format(
@@ -192,15 +192,15 @@ def mix_evals_video2text_doc_to_text_open_convs(doc, model_specific_prompt_kwarg
 """
 
 
-def mix_evals_video2text_doc_to_text_open_2nd_convs(doc, model_specific_prompt_kwargs=None):
-    if model_specific_prompt_kwargs is None:
-        model_specific_prompt_kwargs = {}
+def mix_evals_video2text_doc_to_text_open_2nd_convs(doc, lmms_eval_specific_kwargs=None):
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
     pre_prompt = ""
     post_prompt = ""
-    if "pre_prompt" in model_specific_prompt_kwargs:
-        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
-    if "post_prompt" in model_specific_prompt_kwargs:
-        post_prompt = model_specific_prompt_kwargs["post_prompt"]
+    if "pre_prompt" in lmms_eval_specific_kwargs:
+        pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
+    if "post_prompt" in lmms_eval_specific_kwargs:
+        post_prompt = lmms_eval_specific_kwargs["post_prompt"]
 
     return MODEL_CONVS_PROMPT.format(
         PRE=pre_prompt,