diff --git a/lmms_eval/api/metrics.py b/lmms_eval/api/metrics.py
index a5cc7d68..687c4a65 100644
--- a/lmms_eval/api/metrics.py
+++ b/lmms_eval/api/metrics.py
@@ -6,6 +6,7 @@
 import sklearn.metrics
 import random
 import evaluate
+import torch
 
 from lmms_eval.api.registry import register_metric, register_aggregation
 
@@ -20,11 +21,6 @@ def mean(arr):
     return sum(arr) / len(arr)
 
 
-@register_aggregation("sum")
-def mean(arr):
-    return sum(arr)
-
-
 @register_aggregation("median")
 def median(arr):
     return arr[len(arr) // 2]
@@ -35,6 +31,7 @@ def median(arr):
 @register_aggregation("perplexity")
 def perplexity(items):
     # return math.exp(-mean(items))
+    items = torch.exp(torch.tensor(items)).tolist()
     return sum(items) / len(items)
 
 
diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index f3c03d37..a9a3ddc0 100644
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -9,6 +9,7 @@
 import datasets
 import numpy as np
+from PIL import ImageFile
 
 from typing import Union, List, Any
 from collections.abc import Callable
@@ -38,6 +39,10 @@
 eval_logger = logging.getLogger("lmms-eval")
 
+# HuggingFaceM4/NoCaps contains truncated images in its test split;
+# setting this flag here avoids a PIL error when such images are loaded
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
 
 @dataclass
 class TaskConfig(dict):
@@ -837,7 +842,7 @@ def doc_to_choice(self, doc: Any) -> List[str]:
 
     def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
         if self.OUTPUT_TYPE == "loglikelihood":
-            arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual(doc))
+            arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split"))
         elif self.OUTPUT_TYPE == "loglikelihood_rolling":
             arguments = (self.doc_to_target(doc),)
         elif self.OUTPUT_TYPE == "multiple_choice":
@@ -846,11 +851,11 @@ def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instan
             if self.multiple_input:
                 # If there are multiple inputs, choices are placed in the ctx
                 cont = self.doc_to_target(doc)
-                arguments = [(ctx, f"{target_delimiter}{cont}") for ctx in choices]
+                arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for ctx in choices]
             else:
                 # Otherwise they are placed in the continuation
-                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
-
+                arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for cont in choices]
+            kwargs.pop("split")
         request_list = [
             Instance(
                 request_type="loglikelihood",
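
To make the intent of the metrics.py change above concrete: loglikelihood results now carry raw per-item losses, and the registered "perplexity" aggregation exponentiates them before averaging (the torch.exp that previously lived in the model code, see the llava.py hunk below). A minimal sketch with made-up values:

    import torch

    losses = [2.31, 1.87, 2.05]                          # hypothetical per-document losses
    per_item_ppl = torch.exp(torch.tensor(losses)).tolist()
    aggregated = sum(per_item_ppl) / len(per_item_ppl)   # what perplexity() now returns
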
diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py
index 0e11bab0..2033f093 100644
--- a/lmms_eval/models/llava.py
+++ b/lmms_eval/models/llava.py
@@ -145,8 +145,10 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
         res = []
         pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
 
-        for contexts, continuation, visuals in [reg.args for reg in requests]:
+        for contexts, continuation, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
             # encode, pad, and truncate contexts for this batch
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
+            visuals = self.flatten(visuals)
             if visuals:
                 image = process_images(visuals, self._image_processor, self._config)
                 if type(image) is list:
@@ -186,7 +188,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
             with torch.inference_mode():
                 outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True)
             loss = outputs["loss"]
-            loss = torch.exp(loss)
+            # loss = torch.exp(loss)
             logits = outputs["logits"]
             greedy_tokens = logits.argmax(dim=-1)
             cont_toks = input_ids[:, contxt_id.shape[1] :]  # [1, seq]
diff --git a/lmms_eval/tasks/coco/coco_train.yaml b/lmms_eval/tasks/coco/coco_train.yaml
index 98419369..52be75f7 100644
--- a/lmms_eval/tasks/coco/coco_train.yaml
+++ b/lmms_eval/tasks/coco/coco_train.yaml
@@ -40,8 +40,8 @@ metric_list:
   - metric: coco_CIDEr
     aggregation : !function utils.coco_cider
     higher_is_better : true
-  - metric: coco_SPICE
-    aggregation : !function utils.coco_spice
-    higher_is_better : true
+  #- metric: coco_SPICE
+  #  aggregation : !function utils.coco_spice
+  #  higher_is_better : true
 metadata:
   - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/coco_val.yaml b/lmms_eval/tasks/coco/coco_val.yaml
index 871937ff..264052ba 100644
--- a/lmms_eval/tasks/coco/coco_val.yaml
+++ b/lmms_eval/tasks/coco/coco_val.yaml
@@ -40,8 +40,8 @@ metric_list:
   - metric: coco_CIDEr
     aggregation : !function utils.coco_cider
     higher_is_better : true
-  - metric: coco_SPICE
-    aggregation : !function utils.coco_spice
-    higher_is_better : true
+  #- metric: coco_SPICE
+  #  aggregation : !function utils.coco_spice
+  #  higher_is_better : true
 metadata:
   - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/utils.py b/lmms_eval/tasks/coco/utils.py
index 5a219f0a..e82c3590 100644
--- a/lmms_eval/tasks/coco/utils.py
+++ b/lmms_eval/tasks/coco/utils.py
@@ -4,9 +4,13 @@
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 from pycocotools.coco import COCO
 
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
+
 dir_name = os.path.dirname(os.path.abspath(__file__))
 
-COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
+COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
 
 
 def coco_doc_to_visual(doc):
@@ -71,12 +75,12 @@ def coco_aggregation_result(results, metric):
         gts[imgId] = coco_eval.coco.imgToAnns[imgId]
         res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]
 
-    print("tokenization...")
+    eval_logger.info("tokenization...")
     tokenizer = PTBTokenizer()
     gts = tokenizer.tokenize(gts)
     res = tokenizer.tokenize(res)
 
-    print(f"Computing {metric} scores...")
+    eval_logger.info(f"Computing {metric} scores...")
     score, scores = scorers_dict[metric][0].compute_score(gts, res)
     # When metric is one of the Bleu, score will be a list
@@ -84,6 +88,11 @@
         n = int(metric.split("_")[-1])
         score = score[n - 1]
 
+    if not os.path.exists("./captions_val2014_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_val2014_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
     return score
 
 
@@ -127,8 +136,21 @@ def coco_test_process_result(doc, result):
     Returns:
         a dictionary with key: metric name (in this case coco_passthrough), value: metric value
     """
-    return {"coco_passthrough": {"pred": result}}
+    question_id = doc["question_id"]
+    # The question id in our dataset is the image file name itself
+    image_id = int(question_id.split("_")[-1].split(".")[0])
+    return {"coco_passthrough": {"pred": result, "image_id": image_id}}
 
 
 def coco_test_aggregation_result(results):
+    stored_results = []
+    for result in results:
+        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
+
+    if not os.path.exists("./captions_test2014_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_test2014_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    eval_logger.info("Your test result has been stored. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
     return -1
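
For reference, the *_alg_results.json files written above are plain JSON lists in the COCO caption-server submission format. A minimal sketch of the structure that ends up on disk (image ids and captions are made up):

    import json

    stored_results = [
        {"image_id": 391895, "caption": "a man riding a motorcycle on a dirt road"},
        {"image_id": 522418, "caption": "a woman cutting a large cake"},
    ]
    print(json.dumps(stored_results, indent=4))  # same structure that gets written to disk
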
diff --git a/lmms_eval/tasks/nocaps/nocaps.yaml b/lmms_eval/tasks/nocaps/nocaps.yaml
new file mode 100644
index 00000000..f85c250a
--- /dev/null
+++ b/lmms_eval/tasks/nocaps/nocaps.yaml
@@ -0,0 +1,3 @@
+group : nocaps
+task:
+  - nocaps_caption
\ No newline at end of file
diff --git a/lmms_eval/tasks/nocaps/nocaps_test.yaml b/lmms_eval/tasks/nocaps/nocaps_test.yaml
new file mode 100644
index 00000000..ec2de9fb
--- /dev/null
+++ b/lmms_eval/tasks/nocaps/nocaps_test.yaml
@@ -0,0 +1,26 @@
+dataset_path: HuggingFaceM4/NoCaps
+dataset_kwargs:
+  token: True
+task : "nocaps_test"
+group : "nocaps_caption"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.nocaps_doc_to_visual
+doc_to_text: !function utils.nocaps_doc_to_text
+doc_to_target: "annotations_captions"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.nocaps_test_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: nocaps_passthrough
+    aggregation : !function utils.nocaps_test_aggregation_result
+    higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/nocaps/nocaps_val.yaml b/lmms_eval/tasks/nocaps/nocaps_val.yaml
new file mode 100644
index 00000000..3308eebd
--- /dev/null
+++ b/lmms_eval/tasks/nocaps/nocaps_val.yaml
@@ -0,0 +1,47 @@
+dataset_path: HuggingFaceM4/NoCaps
+dataset_kwargs:
+  token: True
+task: "nocaps_val"
+group : "nocaps_caption"
+test_split: validation
+output_type: generate_until
+doc_to_visual: !function utils.nocaps_doc_to_visual
+doc_to_text: !function utils.nocaps_doc_to_text
+doc_to_target: "annotations_captions"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.nocaps_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: nocaps_Bleu_4
+    aggregation : !function utils.nocaps_bleu4
+    higher_is_better : true
+  - metric: nocaps_Bleu_3
+    aggregation : !function utils.nocaps_bleu3
+    higher_is_better : true
+  - metric: nocaps_Bleu_2
+    aggregation : !function utils.nocaps_bleu2
+    higher_is_better : true
+  - metric: nocaps_Bleu_1
+    aggregation : !function utils.nocaps_bleu1
+    higher_is_better : true
+  - metric: nocaps_METEOR
+    aggregation : !function utils.nocaps_meteor
+    higher_is_better : true
+  - metric: nocaps_ROUGE_L
+    aggregation : !function utils.nocaps_rougel
+    higher_is_better : true
+  - metric: nocaps_CIDEr
+    aggregation : !function utils.nocaps_cider
+    higher_is_better : true
+  #- metric: nocaps_SPICE
+  #  aggregation : !function utils.nocaps_spice
+  #  higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/nocaps/utils.py b/lmms_eval/tasks/nocaps/utils.py
new file mode 100644
index 00000000..4c996e27
--- /dev/null
+++ b/lmms_eval/tasks/nocaps/utils.py
@@ -0,0 +1,151 @@
+import os
+import json
+from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
+from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+from pycocotools.coco import COCO
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
+
+dir_name = os.path.dirname(os.path.abspath(__file__))
+
+NOCAPS_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+
+def nocaps_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+
+def nocaps_doc_to_text(doc):
+    question = "Please carefully observe the image and come up with a caption for the image"
+    return f"{question}\nAnswer the question using a single word or phrase."
+
+
+def nocaps_process_result(doc, result):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name, value: metric value
+    """
+    pred = result[0]
+    # NoCaps provides the numeric image id directly
+    image_id = doc["image_id"]
+
+    data_dict = {"answer": doc["annotations_captions"], "pred": pred, "image_id": image_id}
+
+    return {f"nocaps_{metric}": data_dict for metric in NOCAPS_METRICS}
+
+
+def nocaps_aggregation_result(results, metric):
+    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
+    scorers_dict = {s[1]: s for s in scorers}
+
+    stored_results = []
+    # For the COCO eval tools to successfully create an index,
+    # the dataset dict needs at least two keys:
+    # 'annotations' and 'images'.
+    # 'annotations' exactly reproduces the original annotations,
+    # while 'images' only needs the image ids contained in the file names.
+    dataset = {"annotations": [], "images": []}
+    idx = 0
+    for result in results:
+        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
+        for a in result["answer"]:
+            dataset["annotations"].append({"image_id": int(result["image_id"]), "caption": a, "id": idx})
+            idx += 1
+        dataset["images"].append({"id": result["image_id"]})
+
+    coco = COCO()
+    # Manually create index here
+    coco.dataset = dataset
+    coco.createIndex()
+
+    nocaps_result = coco.loadRes(stored_results)
+    nocaps_eval = COCOEvalCap(coco, nocaps_result)
+
+    imgIds = nocaps_eval.params["image_id"]
+    gts = {}
+    res = {}
+    for imgId in imgIds:
+        gts[imgId] = nocaps_eval.coco.imgToAnns[imgId]
+        res[imgId] = nocaps_eval.cocoRes.imgToAnns[imgId]
+
+    eval_logger.info("tokenization...")
+    tokenizer = PTBTokenizer()
+    gts = tokenizer.tokenize(gts)
+    res = tokenizer.tokenize(res)
+
+    eval_logger.info(f"Computing {metric} scores...")
+
+    score, scores = scorers_dict[metric][0].compute_score(gts, res)
+    # When metric is one of the Bleu, score will be a list
+    if type(score) == list:
+        n = int(metric.split("_")[-1])
+        score = score[n - 1]
+
+    if not os.path.exists("./captions_nocaps_val_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_nocaps_val_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    return score
+
+
+def nocaps_bleu4(results):
+    return nocaps_aggregation_result(results, "Bleu_4")
+
+
+def nocaps_bleu3(results):
+    return nocaps_aggregation_result(results, "Bleu_3")
+
+
+def nocaps_bleu2(results):
+    return nocaps_aggregation_result(results, "Bleu_2")
+
+
+def nocaps_bleu1(results):
+    return nocaps_aggregation_result(results, "Bleu_1")
+
+
+def nocaps_meteor(results):
+    return nocaps_aggregation_result(results, "METEOR")
+
+
+def nocaps_rougel(results):
+    return nocaps_aggregation_result(results, "ROUGE_L")
+
+
+def nocaps_cider(results):
+    return nocaps_aggregation_result(results, "CIDEr")
+
+
+def nocaps_spice(results):
+    return nocaps_aggregation_result(results, "SPICE")
+
+
+def nocaps_test_process_result(doc, result):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name (in this case nocaps_passthrough), value: metric value
+    """
+    return {"nocaps_passthrough": {"pred": result, "image_id": doc["image_id"]}}
+
+
+def nocaps_test_aggregation_result(results):
+    stored_results = []
+    for result in results:
+        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
+
+    if not os.path.exists("./captions_nocaps_test_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_nocaps_test_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    eval_logger.info("Your test result has been stored. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
+    return -1
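
The in-memory COCO index trick used by nocaps_aggregation_result above (and by the coco and textcaps variants) can be isolated as the following sketch; the ids and captions are made up:

    from pycocotools.coco import COCO

    dataset = {
        "annotations": [
            {"image_id": 1, "caption": "a dog rides a skateboard", "id": 0},
            {"image_id": 1, "caption": "a brown dog on a skateboard", "id": 1},
        ],
        "images": [{"id": 1}],
    }
    coco = COCO()            # no annotation file on disk
    coco.dataset = dataset   # attach the dict directly
    coco.createIndex()       # builds the imgToAnns / imgs indices

    predictions = [{"image_id": 1, "caption": "a dog riding a skateboard"}]
    coco_res = coco.loadRes(predictions)   # loadRes accepts an in-memory list, as used above
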
diff --git a/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml b/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml
index 9f4d3213..82fb9914 100644
--- a/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml
+++ b/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcoco+/_default_template_seg_yaml b/lmms_eval/tasks/refcoco+/_default_template_seg_yaml
index 230d4a33..2c38d513 100644
--- a/lmms_eval/tasks/refcoco+/_default_template_seg_yaml
+++ b/lmms_eval/tasks/refcoco+/_default_template_seg_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcoco+/utils.py b/lmms_eval/tasks/refcoco+/utils.py
index 537eec2f..fbec626c 100644
--- a/lmms_eval/tasks/refcoco+/utils.py
+++ b/lmms_eval/tasks/refcoco+/utils.py
@@ -3,7 +3,11 @@
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 from pycocotools.coco import COCO
 
-COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
+COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
 
 
 def refcoco_bbox_doc_to_visual(doc):
@@ -79,12 +83,12 @@ def refcoco_aggregation_result(results, metric):
         gts[imgId] = coco_eval.coco.imgToAnns[imgId]
         res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]
 
-    print("tokenization...")
+    eval_logger.info("tokenization...")
     tokenizer = PTBTokenizer()
     gts = tokenizer.tokenize(gts)
     res = tokenizer.tokenize(res)
 
-    print(f"Computing {metric} scores...")
+    eval_logger.info(f"Computing {metric} scores...")
     score, scores = scorers_dict[metric][0].compute_score(gts, res)
     # coco_eval.setEval(score, metric)
diff --git a/lmms_eval/tasks/refcoco/_default_template_bbox_yaml b/lmms_eval/tasks/refcoco/_default_template_bbox_yaml
index d45ffabd..b5cc6b0f 100644
--- a/lmms_eval/tasks/refcoco/_default_template_bbox_yaml
+++ b/lmms_eval/tasks/refcoco/_default_template_bbox_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcoco/_default_template_seg_yaml b/lmms_eval/tasks/refcoco/_default_template_seg_yaml
index 9ee5558d..85456541 100644
--- a/lmms_eval/tasks/refcoco/_default_template_seg_yaml
+++ b/lmms_eval/tasks/refcoco/_default_template_seg_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcoco/utils.py b/lmms_eval/tasks/refcoco/utils.py
index 01bcace4..ed2b1344 100644
--- a/lmms_eval/tasks/refcoco/utils.py
+++ b/lmms_eval/tasks/refcoco/utils.py
@@ -3,7 +3,11 @@
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 from pycocotools.coco import COCO
 
-COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
+COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
 
 
 def refcoco_bbox_doc_to_visual(doc):
@@ -79,12 +83,12 @@ def refcoco_aggregation_result(results, metric):
         gts[imgId] = coco_eval.coco.imgToAnns[imgId]
         res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]
 
-    print("tokenization...")
+    eval_logger.info("tokenization...")
     tokenizer = PTBTokenizer()
     gts = tokenizer.tokenize(gts)
     res = tokenizer.tokenize(res)
 
-    print(f"Computing {metric} scores...")
+    eval_logger.info(f"Computing {metric} scores...")
     score, scores = scorers_dict[metric][0].compute_score(gts, res)
     # coco_eval.setEval(score, metric)
diff --git a/lmms_eval/tasks/refcocog/_default_template_bbox_yaml b/lmms_eval/tasks/refcocog/_default_template_bbox_yaml
index a88d7b65..9abd94f6 100644
--- a/lmms_eval/tasks/refcocog/_default_template_bbox_yaml
+++ b/lmms_eval/tasks/refcocog/_default_template_bbox_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcocog/_default_template_seg_yaml b/lmms_eval/tasks/refcocog/_default_template_seg_yaml
index 9ee5558d..85456541 100644
--- a/lmms_eval/tasks/refcocog/_default_template_seg_yaml
+++ b/lmms_eval/tasks/refcocog/_default_template_seg_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcocog/utils.py b/lmms_eval/tasks/refcocog/utils.py
index 537eec2f..fbec626c 100644
--- a/lmms_eval/tasks/refcocog/utils.py
+++ b/lmms_eval/tasks/refcocog/utils.py
@@ -3,7 +3,11 @@
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 from pycocotools.coco import COCO
 
-COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
+COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
 
 
 def refcoco_bbox_doc_to_visual(doc):
@@ -79,12 +83,12 @@ def refcoco_aggregation_result(results, metric):
         gts[imgId] = coco_eval.coco.imgToAnns[imgId]
         res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]
 
-    print("tokenization...")
+    eval_logger.info("tokenization...")
     tokenizer = PTBTokenizer()
     gts = tokenizer.tokenize(gts)
     res = tokenizer.tokenize(res)
 
-    print(f"Computing {metric} scores...")
+    eval_logger.info(f"Computing {metric} scores...")
     score, scores = scorers_dict[metric][0].compute_score(gts, res)
     # coco_eval.setEval(score, metric)
diff --git a/lmms_eval/tasks/seedbench/seedbench.yaml b/lmms_eval/tasks/seedbench/seedbench.yaml
new file mode 100644
index 00000000..a573f54b
--- /dev/null
+++ b/lmms_eval/tasks/seedbench/seedbench.yaml
@@ -0,0 +1,27 @@
+dataset_path: lmms-lab/SEED-Bench
+dataset_kwargs:
+  token: True
+task: "seedbench"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.seed_doc_to_visual
+doc_to_text: !function utils.seed_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+# The return value of process_results will be used by metrics
+process_results: !function utils.seed_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: seed_image
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_video
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_all
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/seedbench/seedbench_ppl.yaml b/lmms_eval/tasks/seedbench/seedbench_ppl.yaml
new file mode 100644
index 00000000..ee522b5c
--- /dev/null
+++ b/lmms_eval/tasks/seedbench/seedbench_ppl.yaml
@@ -0,0 +1,15 @@
+dataset_path: lmms-lab/SEED-Bench
+dataset_kwargs:
+  token: True
+task: "seedbench_ppl"
+test_split: test
+output_type: multiple_choice
+doc_to_visual: !function utils.seed_doc_to_visual
+doc_to_text: !function utils.seed_doc_to_text
+doc_to_choice : !function utils.seed_doc_to_choice
+doc_to_target: !function utils.seed_doc_to_mc_target
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: acc
+metadata:
+  - version: 0.0
\ No newline at end of file
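
As a rough illustration of what the seedbench_ppl config above evaluates (a sketch with made-up fields and scores, not the harness's exact scoring code): each choice returned by seed_doc_to_choice becomes one loglikelihood continuation, and accuracy checks whether the highest-scoring choice matches seed_doc_to_mc_target.

    doc = {
        "question": "What is the man holding?",
        "choice_a": "a racket", "choice_b": "a bat",
        "choice_c": "a phone", "choice_d": "a cup",
        "answer": "B",
    }
    choices = [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]]  # seed_doc_to_choice(doc)
    target = doc[{"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"}[doc["answer"]]]  # seed_doc_to_mc_target(doc)

    loglikelihoods = [-4.2, -1.3, -5.0, -3.7]   # hypothetical per-choice scores from the model
    pred = choices[loglikelihoods.index(max(loglikelihoods))]
    correct = pred == target                    # feeds the `acc` metric
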
diff --git a/lmms_eval/tasks/seedbench/utils.py b/lmms_eval/tasks/seedbench/utils.py
new file mode 100644
index 00000000..37943f04
--- /dev/null
+++ b/lmms_eval/tasks/seedbench/utils.py
@@ -0,0 +1,53 @@
+import json
+
+
+def seed_doc_to_visual(doc):
+    return [image.convert("RGB") for image in doc["image"]]
+
+
+def seed_doc_to_text(doc):
+    question = doc["question"]
+    question += "\n" + f"A.{doc['choice_a']}\n"
+    question += "\n" + f"B.{doc['choice_b']}\n"
+    question += "\n" + f"C.{doc['choice_c']}\n"
+    question += "\n" + f"D.{doc['choice_d']}"
+    return f"{question}\nAnswer with the option's letter from the given choices directly."
+
+
+def seed_process_result(doc, result):
+    pred = result[0].strip()
+    answer = doc["answer"]
+    data_type = doc["data_type"]
+
+    return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}}
+
+
+def seed_aggregation_result(results):
+    total_count = 0
+    total_correct = 0
+    for result in results:
+        if result["pred"] == result["answer"]:
+            total_correct += 1
+        total_count += 1
+    return total_correct / total_count
+
+
+def seed_aggregation_result_all(results):
+    score = seed_aggregation_result(results)
+    stored_results = []
+    for result in results:
+        stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]})
+    with open("./seed_submission.json", "w") as f:
+        json.dump(stored_results, f, indent=4)
+        print("Storing files for seed_submission ...")
+
+    return score
+
+
+def seed_doc_to_choice(doc):
+    return [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]]
+
+
+def seed_doc_to_mc_target(doc):
+    answer2choice = {"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"}
+    return doc[answer2choice[doc["answer"]]]
diff --git a/lmms_eval/tasks/textcaps/textcaps.yaml b/lmms_eval/tasks/textcaps/textcaps.yaml
new file mode 100644
index 00000000..25f1ee90
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/textcaps.yaml
@@ -0,0 +1,3 @@
+group : textcaps
+task:
+  - textcaps_caption
\ No newline at end of file
diff --git a/lmms_eval/tasks/textcaps/textcaps_test.yaml b/lmms_eval/tasks/textcaps/textcaps_test.yaml
new file mode 100644
index 00000000..377eb442
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/textcaps_test.yaml
@@ -0,0 +1,26 @@
+dataset_path: lmms-lab/TextCaps
+dataset_kwargs:
+  token: True
+task : "textcaps_test"
+group : "textcaps_caption"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.textcaps_doc_to_visual
+doc_to_text: !function utils.textcaps_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.textcaps_test_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: textcaps_passthrough
+    aggregation : !function utils.textcaps_test_aggregation_result
+    higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/textcaps/textcaps_train.yaml b/lmms_eval/tasks/textcaps/textcaps_train.yaml
new file mode 100644
index 00000000..6a7db24b
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/textcaps_train.yaml
@@ -0,0 +1,47 @@
+dataset_path: lmms-lab/TextCaps
+dataset_kwargs:
+  token: True
+task : "textcaps_train"
+group : "textcaps_caption"
+test_split: train
+output_type: generate_until
+doc_to_visual: !function utils.textcaps_doc_to_visual
+doc_to_text: !function utils.textcaps_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.textcaps_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: textcaps_Bleu_4
+    aggregation : !function utils.textcaps_bleu4
+    higher_is_better : true
+  - metric: textcaps_Bleu_3
+    aggregation : !function utils.textcaps_bleu3
+    higher_is_better : true
+  - metric: textcaps_Bleu_2
+    aggregation : !function utils.textcaps_bleu2
+    higher_is_better : true
+  - metric: textcaps_Bleu_1
+    aggregation : !function utils.textcaps_bleu1
+    higher_is_better : true
+  - metric: textcaps_METEOR
+    aggregation : !function utils.textcaps_meteor
+    higher_is_better : true
+  - metric: textcaps_ROUGE_L
+    aggregation : !function utils.textcaps_rougel
+    higher_is_better : true
+  - metric: textcaps_CIDEr
+    aggregation : !function utils.textcaps_cider
+    higher_is_better : true
+  #- metric: textcaps_SPICE
+  #  aggregation : !function utils.textcaps_spice
+  #  higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/textcaps/textcaps_val.yaml b/lmms_eval/tasks/textcaps/textcaps_val.yaml
new file mode 100644
index 00000000..9daf613f
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/textcaps_val.yaml
@@ -0,0 +1,47 @@
+dataset_path: lmms-lab/TextCaps
+dataset_kwargs:
+  token: True
+task: "textcaps_val"
+group : "textcaps_caption"
+test_split: val
+output_type: generate_until
+doc_to_visual: !function utils.textcaps_doc_to_visual
+doc_to_text: !function utils.textcaps_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.textcaps_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: textcaps_Bleu_4
+    aggregation : !function utils.textcaps_bleu4
+    higher_is_better : true
+  - metric: textcaps_Bleu_3
+    aggregation : !function utils.textcaps_bleu3
+    higher_is_better : true
+  - metric: textcaps_Bleu_2
+    aggregation : !function utils.textcaps_bleu2
+    higher_is_better : true
+  - metric: textcaps_Bleu_1
+    aggregation : !function utils.textcaps_bleu1
+    higher_is_better : true
+  - metric: textcaps_METEOR
+    aggregation : !function utils.textcaps_meteor
+    higher_is_better : true
+  - metric: textcaps_ROUGE_L
+    aggregation : !function utils.textcaps_rougel
+    higher_is_better : true
+  - metric: textcaps_CIDEr
+    aggregation : !function utils.textcaps_cider
+    higher_is_better : true
+  #- metric: textcaps_SPICE
+  #  aggregation : !function utils.textcaps_spice
+  #  higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/textcaps/utils.py b/lmms_eval/tasks/textcaps/utils.py
new file mode 100644
index 00000000..f34d9660
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/utils.py
@@ -0,0 +1,149 @@
+import os
+import json
+from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
+from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+from pycocotools.coco import COCO
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
+
+dir_name = os.path.dirname(os.path.abspath(__file__))
+
+TEXTCAPS_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+
+def textcaps_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+
+def textcaps_doc_to_text(doc):
+    question = doc["question"]
+    return f"{question}\nAnswer the question using a single word or phrase."
+
+
+def textcaps_process_result(doc, result):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name, value: metric value
+    """
+    pred = result[0] if len(result) > 0 else ""
+
+    data_dict = {"answer": doc["caption_str"], "pred": pred, "image_id": doc["image_id"]}
+
+    return {f"textcaps_{metric}": data_dict for metric in TEXTCAPS_METRICS}
+
+
+def textcaps_aggregation_result(results, metric):
+    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
+    scorers_dict = {s[1]: s for s in scorers}
+
+    stored_results = []
+    # For the COCO eval tools to successfully create an index,
+    # the dataset dict needs at least two keys:
+    # 'annotations' and 'images'.
+    # 'annotations' exactly reproduces the original annotations,
+    # while 'images' only needs the image ids contained in the file names.
+    dataset = {"annotations": [], "images": []}
+    idx = 0
+    for result in results:
+        stored_results.append({"image_id": result["image_id"], "caption": result["pred"]})
+        for a in result["answer"]:
+            dataset["annotations"].append({"image_id": result["image_id"], "caption": a, "id": idx})
+            idx += 1
+        dataset["images"].append({"id": result["image_id"]})
+
+    coco = COCO()
+    # Manually create index here
+    coco.dataset = dataset
+    coco.createIndex()
+
+    textcaps_result = coco.loadRes(stored_results)
+    textcaps_eval = COCOEvalCap(coco, textcaps_result)
+
+    imgIds = textcaps_eval.params["image_id"]
+    gts = {}
+    res = {}
+    for imgId in imgIds:
+        gts[imgId] = textcaps_eval.coco.imgToAnns[imgId]
+        res[imgId] = textcaps_eval.cocoRes.imgToAnns[imgId]
+
+    eval_logger.info("tokenization...")
+    tokenizer = PTBTokenizer()
+    gts = tokenizer.tokenize(gts)
+    res = tokenizer.tokenize(res)
+
+    eval_logger.info(f"Computing {metric} scores...")
+
+    score, scores = scorers_dict[metric][0].compute_score(gts, res)
+    # When metric is one of the Bleu, score will be a list
+    if type(score) == list:
+        n = int(metric.split("_")[-1])
+        score = score[n - 1]
+
+    if not os.path.exists("./captions_val2014_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_val2014_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    return score
+
+
+def textcaps_bleu4(results):
+    return textcaps_aggregation_result(results, "Bleu_4")
+
+
+def textcaps_bleu3(results):
+    return textcaps_aggregation_result(results, "Bleu_3")
+
+
+def textcaps_bleu2(results):
+    return textcaps_aggregation_result(results, "Bleu_2")
+
+
+def textcaps_bleu1(results):
+    return textcaps_aggregation_result(results, "Bleu_1")
+
+
+def textcaps_meteor(results):
+    return textcaps_aggregation_result(results, "METEOR")
+
+
+def textcaps_rougel(results):
+    return textcaps_aggregation_result(results, "ROUGE_L")
+
+
+def textcaps_cider(results):
+    return textcaps_aggregation_result(results, "CIDEr")
+
+
+def textcaps_spice(results):
+    return textcaps_aggregation_result(results, "SPICE")
+
+
+def textcaps_test_process_result(doc, result):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name (in this case textcaps_passthrough), value: metric value
+    """
+    return {"textcaps_passthrough": {"pred": result, "image_id": doc["image_id"]}}
+
+
+def textcaps_test_aggregation_result(results):
+    stored_results = []
+    for result in results:
+        stored_results.append({"image_id": result["image_id"], "caption": result["pred"]})
+
+    if not os.path.exists("./captions_test2014_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_test2014_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    eval_logger.info("Your test result has been stored. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
+    return -1