[Dataset] Add SEED-Bench, TextCaps, NoCaps (EvolvingLMMs-Lab#12)
* Change coco from print to logger

* Add llava loglikelihood

* Add Nocaps support

* Fix pass through function

* Add textcaps support

* Fix textcaps eval image_id

* Add seedbench support

* Add seedbench ppl evaluation

* black lint
kcz358 authored Jan 23, 2024
1 parent b5984ac commit b5ad3ed
Showing 27 changed files with 680 additions and 48 deletions.
7 changes: 2 additions & 5 deletions lmms_eval/api/metrics.py
@@ -6,6 +6,7 @@
import sklearn.metrics
import random
import evaluate
import torch

from lmms_eval.api.registry import register_metric, register_aggregation

@@ -20,11 +21,6 @@ def mean(arr):
return sum(arr) / len(arr)


@register_aggregation("sum")
def mean(arr):
return sum(arr)


@register_aggregation("median")
def median(arr):
return arr[len(arr) // 2]
@@ -35,6 +31,7 @@ def median(arr):
@register_aggregation("perplexity")
def perplexity(items):
# return math.exp(-mean(items))
items = torch.exp(torch.tensor(items)).tolist()
return sum(items) / len(items)


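For reference, a minimal sketch of the perplexity aggregation as it stands after this hunk, with a made-up input to show the behavior: each per-document value is exponentiated first and the results are averaged, rather than exponentiating the negated mean as the commented-out line did.

```python
import torch

# Sketch of the updated aggregation; the input values are illustrative
# per-document log values, not real evaluation output.
def perplexity(items):
    items = torch.exp(torch.tensor(items)).tolist()
    return sum(items) / len(items)

print(perplexity([-0.5, -1.0, -2.0]))  # ~0.37
```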
13 changes: 9 additions & 4 deletions lmms_eval/api/task.py
@@ -9,6 +9,7 @@

import datasets
import numpy as np
from PIL import ImageFile

from typing import Union, List, Any
from collections.abc import Callable
@@ -38,6 +39,10 @@

eval_logger = logging.getLogger("lmms-eval")

# HuggingFaceM4/NoCaps contains truncated images in the test split
# Setting this flag lets PIL load them instead of raising an error
ImageFile.LOAD_TRUNCATED_IMAGES = True


@dataclass
class TaskConfig(dict):
@@ -837,7 +842,7 @@ def doc_to_choice(self, doc: Any) -> List[str]:

def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
if self.OUTPUT_TYPE == "loglikelihood":
arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual(doc))
arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split"))
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
arguments = (self.doc_to_target(doc),)
elif self.OUTPUT_TYPE == "multiple_choice":
@@ -846,11 +851,11 @@ def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
arguments = [(ctx, f"{target_delimiter}{cont}") for ctx in choices]
arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for ctx in choices]
else:
# Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]

arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for cont in choices]
kwargs.pop("split")
request_list = [
Instance(
request_type="loglikelihood",
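A rough sketch of the six-slot argument tuple that construct_requests now builds for loglikelihood requests; the type alias and the slot comments are illustrative only, not part of the library.

```python
from typing import Any, Callable, Tuple

# Illustrative shape of the new loglikelihood arguments, slot by slot.
LoglikelihoodArgs = Tuple[
    str,                    # ctx: the formatted context / prompt
    str,                    # continuation: target_delimiter + choice or target
    Callable[[dict], Any],  # doc_to_visual: called lazily by the model
    int,                    # doc_id, taken from kwargs["metadata"][1]
    str,                    # task name (self.config.task), used to index task_dict
    str,                    # dataset split (kwargs["split"])
]
```

Packing the callable instead of pre-decoded images keeps the request objects light; the model only resolves the visuals when it scores the request, as the llava.py change below shows.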
6 changes: 4 additions & 2 deletions lmms_eval/models/llava.py
@@ -145,8 +145,10 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
res = []
pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

for contexts, continuation, visuals in [reg.args for reg in requests]:
for contexts, continuation, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
# encode, pad, and truncate contexts for this batch
visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
visuals = self.flatten(visuals)
if visuals:
image = process_images(visuals, self._image_processor, self._config)
if type(image) is list:
@@ -186,7 +188,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
with torch.inference_mode():
outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True)
loss = outputs["loss"]
loss = torch.exp(loss)
# loss = torch.exp(loss)
logits = outputs["logits"]
greedy_tokens = logits.argmax(dim=-1)
cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq]
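A minimal sketch of how the model side consumes those arguments, with the task_dict layout assumed for illustration. Note that loss = torch.exp(loss) is commented out because the exponentiation now happens later, in the perplexity aggregation above.

```python
# Sketch only: rebuild the visuals from the request args instead of
# receiving pre-decoded images (the task_dict layout is an assumption).
def resolve_visuals(task_dict, args):
    contexts, continuation, doc_to_visual, doc_id, task, split = args
    doc = task_dict[task][split][doc_id]
    visuals = [doc_to_visual(doc)]
    # Flatten a possible list of lists of images, as self.flatten does.
    return [img for group in visuals
            for img in (group if isinstance(group, list) else [group])]
```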
6 changes: 3 additions & 3 deletions lmms_eval/tasks/coco/coco_train.yaml
@@ -40,8 +40,8 @@ metric_list:
- metric: coco_CIDEr
aggregation : !function utils.coco_cider
higher_is_better : true
- metric: coco_SPICE
aggregation : !function utils.coco_spice
higher_is_better : true
#- metric: coco_SPICE
# aggregation : !function utils.coco_spice
# higher_is_better : true
metadata:
- version: 0.0
6 changes: 3 additions & 3 deletions lmms_eval/tasks/coco/coco_val.yaml
@@ -40,8 +40,8 @@ metric_list:
- metric: coco_CIDEr
aggregation : !function utils.coco_cider
higher_is_better : true
- metric: coco_SPICE
aggregation : !function utils.coco_spice
higher_is_better : true
#- metric: coco_SPICE
# aggregation : !function utils.coco_spice
# higher_is_better : true
metadata:
- version: 0.0
30 changes: 26 additions & 4 deletions lmms_eval/tasks/coco/utils.py
@@ -4,9 +4,13 @@
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocotools.coco import COCO

import logging

eval_logger = logging.getLogger("lmms-eval")

dir_name = os.path.dirname(os.path.abspath(__file__))

COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"] # , "SPICE"]


def coco_doc_to_visual(doc):
@@ -71,19 +75,24 @@ def coco_aggregation_result(results, metric):
gts[imgId] = coco_eval.coco.imgToAnns[imgId]
res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]

print("tokenization...")
eval_logger.info("tokenization...")
tokenizer = PTBTokenizer()
gts = tokenizer.tokenize(gts)
res = tokenizer.tokenize(res)

print(f"Computing {metric} scores...")
eval_logger.info(f"Computing {metric} scores...")

score, scores = scorers_dict[metric][0].compute_score(gts, res)
# When the metric is one of the Bleu variants, score will be a list
if type(score) == list:
n = int(metric.split("_")[-1])
score = score[n - 1]

if not os.path.exists("./captions_val2014_alg_results.json"):
eval_logger.info("Storing prediction that can be submitted to the server ...")
with open("./captions_val2014_alg_results.json", "w") as f:
json.dump(stored_results, f, indent=4)

return score


@@ -127,8 +136,21 @@ def coco_test_process_result(doc, result):
Returns:
a dictionary with key: metric name (in this case coco_passthrough), value: metric value
"""
return {"coco_passthrough": {"pred": result}}
question_id = doc["question_id"]
# The question id in our dataset is the image file name itself
image_id = int(question_id.split("_")[-1].split(".")[0])
return {"coco_passthrough": {"pred": result, "image_id": image_id}}


def coco_test_aggregation_result(results):
stored_results = []
for result in results:
stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})

if not os.path.exists("./captions_test2014_alg_results.json"):
eval_logger.info("Storing prediction that can be submitted to the server ...")
with open("./captions_test2014_alg_results.json", "w") as f:
json.dump(stored_results, f, indent=4)

eval_logger.info("Your test result has been stored. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
return -1
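For reference, a small sketch of the submission file that the aggregation above writes; the file name comes from the diff, while the image ids and captions below are invented.

```python
import json

# Illustrative payload in the COCO results format expected by the
# evaluation server; the entries are made up for this example.
stored_results = [
    {"image_id": 1, "caption": "A man riding a motorcycle on a dirt road."},
    {"image_id": 2, "caption": "A woman cutting a large white cake."},
]

with open("./captions_test2014_alg_results.json", "w") as f:
    json.dump(stored_results, f, indent=4)
```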
3 changes: 3 additions & 0 deletions lmms_eval/tasks/nocaps/nocaps.yaml
@@ -0,0 +1,3 @@
group : nocaps
task:
- nocaps_caption
26 changes: 26 additions & 0 deletions lmms_eval/tasks/nocaps/nocaps_test.yaml
@@ -0,0 +1,26 @@
dataset_path: HuggingFaceM4/NoCaps
dataset_kwargs:
token: True
task : "nocaps_test"
group : "nocaps_caption"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.nocaps_doc_to_visual
doc_to_text: !function utils.nocaps_doc_to_text
doc_to_target: "annotations_captions"
generation_kwargs:
until:
- "ASSISTANT:"
max_new_tokens: 1024
temperature: 0
top_p: 0
num_beams: 1
do_sample: false
process_results: !function utils.nocaps_test_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: nocaps_passthrough
aggregation : !function utils.nocaps_test_aggregation_result
higher_is_better : true
metadata:
- version: 0.0
47 changes: 47 additions & 0 deletions lmms_eval/tasks/nocaps/nocaps_val.yaml
@@ -0,0 +1,47 @@
dataset_path: HuggingFaceM4/NoCaps
dataset_kwargs:
token: True
task: "nocaps_val"
group : "nocaps_caption"
test_split: validation
output_type: generate_until
doc_to_visual: !function utils.nocaps_doc_to_visual
doc_to_text: !function utils.nocaps_doc_to_text
doc_to_target: "annotations_captions"
generation_kwargs:
until:
- "ASSISTANT:"
max_new_tokens: 1024
temperature: 0
top_p: 0
num_beams: 1
do_sample: false
process_results: !function utils.nocaps_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: nocaps_Bleu_4
aggregation : !function utils.nocaps_bleu4
higher_is_better : true
- metric: nocaps_Bleu_3
aggregation : !function utils.nocaps_bleu3
higher_is_better : true
- metric: nocaps_Bleu_2
aggregation : !function utils.nocaps_bleu2
higher_is_better : true
- metric: nocaps_Bleu_1
aggregation : !function utils.nocaps_bleu1
higher_is_better : true
- metric: nocaps_METEOR
aggregation : !function utils.nocaps_meteor
higher_is_better : true
- metric: nocaps_ROUGE_L
aggregation : !function utils.nocaps_rougel
higher_is_better : true
- metric: nocaps_CIDEr
aggregation : !function utils.nocaps_cider
higher_is_better : true
#- metric: nocaps_SPICE
# aggregation : !function utils.nocaps_spice
# higher_is_better : true
metadata:
- version: 0.0
(Remaining changed files not shown.)
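The utils.nocaps_* helpers referenced by the YAML above live in one of the files not shown in this view (presumably lmms_eval/tasks/nocaps/utils.py). A hypothetical sketch of the test-split passthrough pair, assuming it mirrors the COCO helpers earlier in this diff; the image_id field is an assumption about the HuggingFaceM4/NoCaps schema.

```python
# Hypothetical: mirrors coco_test_process_result / coco_test_aggregation_result.
def nocaps_test_process_result(doc, result):
    # The key must match the metric name declared in metric_list.
    return {"nocaps_passthrough": {"pred": result, "image_id": doc["image_id"]}}


def nocaps_test_aggregation_result(results):
    return [{"image_id": r["image_id"], "caption": r["pred"]} for r in results]
```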
