diff --git a/docs/task_guide.md b/docs/task_guide.md
index 1376bc22..1e7d3a9d 100755
--- a/docs/task_guide.md
+++ b/docs/task_guide.md
@@ -40,7 +40,7 @@ metric_list:
   - metric: mme_cognition_score
     aggregation: !function utils.mme_aggregate_results
     higher_is_better: true
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer the question using a single word or phrase."
@@ -52,7 +52,7 @@ metadata:
 ```
 
 You can pay special attention to the `process_results` and `metric_list` fields, which are used to define how the model output is post-processed and scored.
-Also, the `model_specific_prompt_kwargs` field is used to define model-specific prompt configurations. The default is set to follow Llava.
+Also, the `lmms_eval_specific_kwargs` field is used to define model-specific prompt configurations. The default is set to follow Llava.
 
 PPL-based tasks:
 - Seedbench (`lmms_eval/tasks/seedbench/seedbench_ppl.yaml`)
diff --git a/lmms_eval/tasks/ai2d/ai2d_lite.yaml b/lmms_eval/tasks/ai2d/ai2d_lite.yaml
index bdeb9724..b71abe80 100644
--- a/lmms_eval/tasks/ai2d/ai2d_lite.yaml
+++ b/lmms_eval/tasks/ai2d/ai2d_lite.yaml
@@ -9,7 +9,7 @@
 doc_to_visual: !function utils.ai2d_doc_to_visual
 doc_to_text: !function utils.ai2d_doc_to_text
 doc_to_target: !function utils.ai2d_doc_to_target
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     prompt_format: mcq
     pre_prompt: ""
diff --git a/lmms_eval/tasks/chartqa/chartqa_lite.yaml b/lmms_eval/tasks/chartqa/chartqa_lite.yaml
index 4fbce497..96daff5f 100644
--- a/lmms_eval/tasks/chartqa/chartqa_lite.yaml
+++ b/lmms_eval/tasks/chartqa/chartqa_lite.yaml
@@ -25,7 +25,7 @@ metric_list:
     higher_is_better: true
 metadata:
   - version: 0.0
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer the question with a single word."
diff --git a/lmms_eval/tasks/docvqa/docvqa_val_lite.yaml b/lmms_eval/tasks/docvqa/docvqa_val_lite.yaml
index afda5eb5..95d065df 100644
--- a/lmms_eval/tasks/docvqa/docvqa_val_lite.yaml
+++ b/lmms_eval/tasks/docvqa/docvqa_val_lite.yaml
@@ -16,7 +16,7 @@ generation_kwargs:
   max_new_tokens: 32
   temperature: 0
   do_sample: False
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer the question using a single word or phrase."
diff --git a/lmms_eval/tasks/gqa/gqa_lite.yaml b/lmms_eval/tasks/gqa/gqa_lite.yaml
index 81042041..7f432fc4 100644
--- a/lmms_eval/tasks/gqa/gqa_lite.yaml
+++ b/lmms_eval/tasks/gqa/gqa_lite.yaml
@@ -23,7 +23,7 @@ metric_list:
 
 metadata:
   - version: 0.0
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer the question using a single word or phrase."
diff --git a/lmms_eval/tasks/infovqa/infovqa_val_lite.yaml b/lmms_eval/tasks/infovqa/infovqa_val_lite.yaml
index eab75120..f52ded6e 100644
--- a/lmms_eval/tasks/infovqa/infovqa_val_lite.yaml
+++ b/lmms_eval/tasks/infovqa/infovqa_val_lite.yaml
@@ -16,7 +16,7 @@ generation_kwargs:
   max_new_tokens: 32
   temperature: 0
   do_sample: False
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer the question using a single word or phrase."
\ No newline at end of file
diff --git a/lmms_eval/tasks/mirb/mirb.yaml b/lmms_eval/tasks/mirb/mirb.yaml
index b42e3c84..099e578c 100644
--- a/lmms_eval/tasks/mirb/mirb.yaml
+++ b/lmms_eval/tasks/mirb/mirb.yaml
@@ -10,7 +10,7 @@
 doc_to_text: !function utils.mirb_doc_to_text
 doc_to_target: !function utils.mirb_doc_to_target
 process_results: !function utils.mirb_process_results
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: ""
diff --git a/lmms_eval/tasks/mirb/utils.py b/lmms_eval/tasks/mirb/utils.py
index 174659dc..3e675d39 100644
--- a/lmms_eval/tasks/mirb/utils.py
+++ b/lmms_eval/tasks/mirb/utils.py
@@ -24,11 +24,11 @@ def get_task_instruction(dataset):
     return instr
 
 
-def mirb_doc_to_text(doc, model_specific_prompt_kwargs=None):
+def mirb_doc_to_text(doc, lmms_eval_specific_kwargs=None):
     subset, question = doc["subset"], doc["questions"]
     task_instruction = get_task_instruction(subset)
-    post_prompt = model_specific_prompt_kwargs["post_prompt"]
-    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
+    post_prompt = lmms_eval_specific_kwargs["post_prompt"]
+    pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
     return f"{pre_prompt}{task_instruction}{question}{post_prompt}"
 
 
diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_dev_lite.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_dev_lite.yaml
index 4fc38f96..78b36a65 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cn_dev_lite.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cn_dev_lite.yaml
@@ -22,7 +22,7 @@ generation_kwargs:
   num_beams: 1
   do_sample: false
 process_results: !function cn_utils.mmbench_process_results
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
diff --git a/lmms_eval/tasks/mmbench/mmbench_en_dev_lite.yaml b/lmms_eval/tasks/mmbench/mmbench_en_dev_lite.yaml
index 60b9574e..226c4843 100644
--- a/lmms_eval/tasks/mmbench/mmbench_en_dev_lite.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_en_dev_lite.yaml
@@ -5,7 +5,7 @@ dataset_name: mmbench_en_dev
 dataset_kwargs:
   token: True
 doc_to_target: "answer"
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer with the option's letter from the given choices directly."
diff --git a/lmms_eval/tasks/ok_vqa/ok_vqa_val2014_lite.yaml b/lmms_eval/tasks/ok_vqa/ok_vqa_val2014_lite.yaml
index 60b76f73..7567e696 100644
--- a/lmms_eval/tasks/ok_vqa/ok_vqa_val2014_lite.yaml
+++ b/lmms_eval/tasks/ok_vqa/ok_vqa_val2014_lite.yaml
@@ -20,7 +20,7 @@ metric_list:
     aggregation: !function utils.ok_vqa_aggregate_submissions
     higher_is_better: true
 process_results: !function utils.ok_vqa_process_results
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
diff --git a/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml b/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
index c47a3dba..71620676 100755
--- a/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
+++ b/lmms_eval/tasks/seedbench_2_plus/seedbench_2_plus.yaml
@@ -31,7 +31,7 @@ metric_list:
 
 metadata:
   - version: 0.0
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   llava :
     img_token : <image>
     post_prompt : "Answer with the option's letter from the given choices directly."
diff --git a/lmms_eval/tasks/textcaps/textcaps_val_lite.yaml b/lmms_eval/tasks/textcaps/textcaps_val_lite.yaml
index 95525e07..a72a40b6 100644
--- a/lmms_eval/tasks/textcaps/textcaps_val_lite.yaml
+++ b/lmms_eval/tasks/textcaps/textcaps_val_lite.yaml
@@ -43,6 +43,6 @@ metric_list:
 #     higher_is_better : true
 metadata:
   - version: 0.0
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     prompt: Provide a one-sentence caption for the provided image.
\ No newline at end of file
diff --git a/lmms_eval/tasks/textvqa/textvqa_val_lite.yaml b/lmms_eval/tasks/textvqa/textvqa_val_lite.yaml
index 5438228a..dc18a09f 100644
--- a/lmms_eval/tasks/textvqa/textvqa_val_lite.yaml
+++ b/lmms_eval/tasks/textvqa/textvqa_val_lite.yaml
@@ -19,7 +19,7 @@ generation_kwargs:
   until:
     - "ASSISTANT:"
 process_results: !function utils.textvqa_process_results
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer the question using a single word or phrase."
diff --git a/lmms_eval/tasks/vibe_eval/utils.py b/lmms_eval/tasks/vibe_eval/utils.py
index 09b0c58a..71ad4afa 100644
--- a/lmms_eval/tasks/vibe_eval/utils.py
+++ b/lmms_eval/tasks/vibe_eval/utils.py
@@ -8,6 +8,8 @@
 import os
 from copy import deepcopy
 
+from loguru import logger as eval_logger
+
 try:
     from reka import ChatMessage
     from reka.client import Reka
diff --git a/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val_lite.yaml b/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val_lite.yaml
index d686d3ff..915fa7cd 100644
--- a/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val_lite.yaml
+++ b/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val_lite.yaml
@@ -11,7 +11,7 @@ generation_kwargs:
     - "ASSISTANT:"
 metadata:
   - version: 0.0
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
diff --git a/lmms_eval/tasks/vqav2/vqav2_val_lite.yaml b/lmms_eval/tasks/vqav2/vqav2_val_lite.yaml
index 1446c8df..c0211c62 100644
--- a/lmms_eval/tasks/vqav2/vqav2_val_lite.yaml
+++ b/lmms_eval/tasks/vqav2/vqav2_val_lite.yaml
@@ -11,7 +11,7 @@ generation_kwargs:
   max_new_tokens: 16
 metadata:
   - version: 0.0
-model_specific_prompt_kwargs:
+lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: "\nAnswer the question using a single word or phrase."
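For reference, the mirb/utils.py hunk above shows the calling convention that every task touched by this rename follows: the framework passes the per-model prompt configuration into the task's `doc_to_text` hook under the keyword `lmms_eval_specific_kwargs`. A minimal sketch of that pattern, where the task name `my_task` and the `question` field are hypothetical placeholders rather than anything defined in this patch:

```python
# Illustrative sketch only -- mirrors the mirb_doc_to_text pattern above.
# "my_task" and the doc's "question" field are hypothetical placeholders.
def my_task_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    # Guard against the default None so the hook also works standalone.
    if lmms_eval_specific_kwargs is None:
        lmms_eval_specific_kwargs = {}
    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
    return f"{pre_prompt}{doc['question']}{post_prompt}"
```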