diff --git a/docs/current_tasks.md b/docs/current_tasks.md
index 1622e960..6e0e5238 100644
--- a/docs/current_tasks.md
+++ b/docs/current_tasks.md
@@ -50,6 +50,7 @@
 - MMMU (mmmu)
   - MMMU Validation (mmmu_val)
   - MMMU Test (mmmu_test)
+- MMStar (mmstar)
 - MMUPD (mmupd)
   - MMUPD Base (mmupd_base)
     - MMAAD Base (mmaad_base)
diff --git a/lmms_eval/tasks/mmstar/mmstar.yaml b/lmms_eval/tasks/mmstar/mmstar.yaml
new file mode 100644
index 00000000..84597846
--- /dev/null
+++ b/lmms_eval/tasks/mmstar/mmstar.yaml
@@ -0,0 +1,37 @@
+dataset_path: Lin-Chen/MMStar
+dataset_kwargs:
+  token: True
+task: "mmstar"
+test_split: val
+output_type: generate_until
+doc_to_visual: !function utils.mmstar_doc_to_visual
+doc_to_text: !function utils.mmstar_doc_to_text
+doc_to_target: "answer"
+# The return value of process_results will be used by metrics
+process_results: !function utils.mmstar_process_results
+# Note that the metric name can be either a registered metric function (as is the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: coarse perception
+    aggregation: !function utils.mmstar_aggregate_results
+    higher_is_better: true
+  - metric: fine-grained perception
+    aggregation: !function utils.mmstar_aggregate_results
+    higher_is_better: true
+  - metric: instance reasoning
+    aggregation: !function utils.mmstar_aggregate_results
+    higher_is_better: true
+  - metric: logical reasoning
+    aggregation: !function utils.mmstar_aggregate_results
+    higher_is_better: true
+  - metric: science & technology
+    aggregation: !function utils.mmstar_aggregate_results
+    higher_is_better: true
+  - metric: math
+    aggregation: !function utils.mmstar_aggregate_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly"
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/mmstar/utils.py b/lmms_eval/tasks/mmstar/utils.py
new file mode 100644
index 00000000..66933f29
--- /dev/null
+++ b/lmms_eval/tasks/mmstar/utils.py
@@ -0,0 +1,123 @@
+from collections import defaultdict
+import os
+import datetime
+import json
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+
+
+from loguru import logger as eval_logger
+
+dir_name = os.path.dirname(os.path.abspath(__file__))
+
+# MMStar taxonomy: top-level (L1) categories and their L2 sub-categories.
+eval_type_dict = {
+    "coarse perception": [
+        "image scene and topic",
+        "image style & quality",
+        "image emotion"
+    ],
+    "fine-grained perception": [
+        "object counting",
+        "recognition",
+        "localization"
+    ],
+    "instance reasoning": [
+        "single-instance reasoning",
+        "cross-instance attribute reasoning",
+        "cross-instance relation reasoning"
+    ],
+    "logical reasoning": [
+        "code & sequence reasoning",
+        "diagram reasoning",
+        "common reasoning"
+    ],
+    "science & technology": [
+        "biology & chemistry & physics",
+        "electronics & energy & mechanical eng.",
+        "geography & earth science & agriculture"
+    ],
+    "math": [
+        "geometry",
+        "numeric commonsense and calculation",
+        "statistical reasoning"
+    ]
+}
+
+
+# Suffix occasionally present in question text; stripped before pre/post prompts are applied.
+replace_prompt = " Please answer yes or no."
+
+
+def mmstar_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+
+def mmstar_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    if model_specific_prompt_kwargs is None:
+        model_specific_prompt_kwargs = {}
+    question = doc["question"].strip()
+    if "pre_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["pre_prompt"] != "":
+        question = question.replace(replace_prompt, "")
+        question = f"{model_specific_prompt_kwargs['pre_prompt']}{question}"
+    if "post_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["post_prompt"] != "":
+        question = question.replace(replace_prompt, "")
+        question = f"{question}{model_specific_prompt_kwargs['post_prompt']}"
+    return question
+
+
+def exact_match(pred, gt):
+    """Brought from the official MMStar evaluation code"""
+    answer = gt.lower().strip().replace('\n', ' ')
+    predict = pred.lower().strip().replace('\n', ' ')
+    try:
+        if answer == predict[0]:
+            return 1.0
+        elif predict[0] == '(' and answer == predict[1]:
+            return 1.0
+        elif predict[0:7] == 'option ' and answer == predict[7]:
+            return 1.0
+        elif predict[0:14] == 'the answer is ' and answer == predict[14]:
+            return 1.0
+    except Exception:
+        return 0.0
+    return 0.0
+
+
+def mmstar_process_results(doc, results):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary keyed by the sample's category; the value holds the per-sample score record
+    """
+    pred = results[0]
+    gt = doc["answer"]
+
+    score = exact_match(pred, gt)
+    category = doc["category"]
+    l2_category = doc["l2_category"]
+    return {category: {"question_id": doc["index"], "l2_category": l2_category, "score": score}}
+
+
+def mmstar_aggregate_results(results):
+    """
+    Args:
+        results: a list of values returned by process_results
+    Returns:
+        A score
+    """
+    l2_category_scores = defaultdict(list)
+    for result in results:
+        score = result["score"]
+        l2_category = result["l2_category"]
+        l2_category_scores[l2_category].append(score)
+
+    l2_category_avg_score = {}
+    for l2_category, scores in l2_category_scores.items():
+        avg_score = sum(scores) / len(scores)
+        l2_category_avg_score[l2_category] = avg_score
+        eval_logger.info(f"{l2_category}: {avg_score:.2f}")
+
+    avg_score = sum(l2_category_avg_score.values()) / len(l2_category_avg_score)
+    return avg_score
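
As a quick illustration of how the two hooks above fit together, the sketch below feeds a couple of hypothetical MMStar-style rows through `mmstar_process_results` and `mmstar_aggregate_results`. It is a minimal sketch, assuming `lmms_eval` is installed so the new `lmms_eval.tasks.mmstar.utils` module is importable; the toy rows only mimic the `index`/`answer`/`category`/`l2_category` fields that the scoring path actually reads.

```python
# Illustrative only: assumes lmms_eval is installed so lmms_eval.tasks.mmstar.utils resolves.
from lmms_eval.tasks.mmstar.utils import mmstar_aggregate_results, mmstar_process_results

# Hypothetical stand-ins for MMStar rows; only the fields used by process_results are included.
toy_docs = [
    {"index": 0, "answer": "B", "category": "math", "l2_category": "geometry"},
    {"index": 1, "answer": "C", "category": "math", "l2_category": "statistical reasoning"},
]
toy_preds = ["B", "The answer is D"]  # first matches the gold letter, second does not

# Each record is keyed by the sample's L1 category, which matches a metric name in mmstar.yaml.
math_records = [mmstar_process_results(doc, [pred])["math"] for doc, pred in zip(toy_docs, toy_preds)]

# The aggregator averages within each L2 sub-category, then averages those means:
# geometry = 1.0, statistical reasoning = 0.0 -> overall 0.5.
print(mmstar_aggregate_results(math_records))
```

Because the yaml lists each L1 category as its own metric and `process_results` keys records by that category, the framework groups samples per category and the aggregator reports a macro-average over L2 sub-categories within it.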