From d8f1f9cf7ad824b8b5315ab6fafb8768fb4345d6 Mon Sep 17 00:00:00 2001 From: Ren Lingfeng Date: Sat, 14 Dec 2024 02:32:05 +0800 Subject: [PATCH] Mmvetv2 (#458) * add task MMVet-v2 * Fix lint warnings using pre-commit * add comment for mmvetv2_group_img.yaml * fix images_tokens format --- lmms_eval/tasks/mmvetv2/mmvetv2.yaml | 1 + .../tasks/mmvetv2/mmvetv2_group_img.yaml | 1 + lmms_eval/tasks/mmvetv2/utils.py | 40 +++++-------------- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/lmms_eval/tasks/mmvetv2/mmvetv2.yaml b/lmms_eval/tasks/mmvetv2/mmvetv2.yaml index fa22b4ea..184e4917 100644 --- a/lmms_eval/tasks/mmvetv2/mmvetv2.yaml +++ b/lmms_eval/tasks/mmvetv2/mmvetv2.yaml @@ -21,6 +21,7 @@ metric_list: metadata: version: 0.0 gpt_eval_model_name: "gpt-4-0613" + interleaved_format: false lmms_eval_specific_kwargs: default: pre_prompt: "First please perform reasoning, and think step by step to provide best answer to the following question: \n\n" diff --git a/lmms_eval/tasks/mmvetv2/mmvetv2_group_img.yaml b/lmms_eval/tasks/mmvetv2/mmvetv2_group_img.yaml index 9f712f7f..c361a119 100644 --- a/lmms_eval/tasks/mmvetv2/mmvetv2_group_img.yaml +++ b/lmms_eval/tasks/mmvetv2/mmvetv2_group_img.yaml @@ -23,6 +23,7 @@ metric_list: metadata: version: 0.0 gpt_eval_model_name: "gpt-4-0613" + interleaved_format: false lmms_eval_specific_kwargs: default: pre_prompt: "First please perform reasoning, and think step by step to provide best answer to the following question: \n\n" diff --git a/lmms_eval/tasks/mmvetv2/utils.py b/lmms_eval/tasks/mmvetv2/utils.py index eae03970..5cc426c9 100644 --- a/lmms_eval/tasks/mmvetv2/utils.py +++ b/lmms_eval/tasks/mmvetv2/utils.py @@ -138,48 +138,30 @@ def process_images(images, size=1008): return concat_horizontal -def get_images_tokens(input_string): - images = [] - queries = input_string.split("") - for query in queries: - query = query.strip() - if query.endswith((".jpg", ".png", ".jpeg")): - # image_path = os.path.join(image_folder, query) - # images.append(Image.open(image_path).convert("RGB")) - images.append(query) - return images - - def mmvet_group_img_doc_to_visual(doc): - # if doc["image"] is None: - # return [] prompt = doc["question"] - image_tokens = get_images_tokens(prompt) - visual = [doc[image_token].convert("RGB") for image_token in image_tokens] + image_tokens = re.findall(r"", prompt) + visual = [doc[image_token.strip("<>")].convert("RGB") for image_token in image_tokens] visual = process_images(visual) return [visual] def mmvet_doc_to_visual(doc): - # if doc["image"] is None: - # return [] prompt = doc["question"] - image_tokens = get_images_tokens(prompt) - visual = [doc[image_token].convert("RGB") for image_token in image_tokens] + image_tokens = re.findall(r"", prompt) + visual = [doc[image_token.strip("<>")].convert("RGB") for image_token in image_tokens] return visual def replace_images_tokens(input_string): - text_queries = [] + if config["metadata"]["interleaved_format"]: + for i in range(0, 18): + question_text = f"" + query_text = "" + if question_text in input_string: + input_string = input_string.replace(question_text, query_text) queries = input_string.split("") - for query in queries: - query = query.strip() - if query.endswith((".jpg", ".png", ".jpeg")): - text_queries.append("[]") - else: - text_queries.append(query) - question = "".join(text_queries) - return question + return "".join(queries) def doc_to_text(doc, lmms_eval_specific_kwargs=None):