[Dataset] Add SEED-Bench, TextCaps, NoCaps (EvolvingLMMs-Lab#12)
* Change coco from print to logger

* Add llava loglikelihood

* Add Nocaps support

* Fix pass through function

* Add textcaps support

* Fix textcaps eval image_id

* Add seedbench support

* Add seedbench ppl evaluation

* black lint
kcz358 authored Jan 23, 2024
1 parent b5984ac commit b5ad3ed
Showing 27 changed files with 680 additions and 48 deletions.
7 changes: 2 additions & 5 deletions lmms_eval/api/metrics.py
@@ -6,6 +6,7 @@
import sklearn.metrics
import random
import evaluate
import torch

from lmms_eval.api.registry import register_metric, register_aggregation

@@ -20,11 +21,6 @@ def mean(arr):
return sum(arr) / len(arr)


@register_aggregation("sum")
def mean(arr):
return sum(arr)


@register_aggregation("median")
def median(arr):
return arr[len(arr) // 2]
@@ -35,6 +31,7 @@ def median(arr):
@register_aggregation("perplexity")
def perplexity(items):
# return math.exp(-mean(items))
items = torch.exp(torch.tensor(items)).tolist()
return sum(items) / len(items)


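For reference, a minimal sketch of the perplexity aggregation as it stands after this hunk, with a made-up input to show the behavior: each per-document value is exponentiated first and the results are averaged, rather than exponentiating the negated mean as the commented-out line did.

```python
import torch

# Sketch of the updated aggregation; the input values are illustrative
# per-document log values, not real evaluation output.
def perplexity(items):
    items = torch.exp(torch.tensor(items)).tolist()
    return sum(items) / len(items)

print(perplexity([-0.5, -1.0, -2.0]))  # ~0.37
```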
13 changes: 9 additions & 4 deletions lmms_eval/api/task.py
@@ -9,6 +9,7 @@

import datasets
import numpy as np
from PIL import ImageFile

from typing import Union, List, Any
from collections.abc import Callable
@@ -38,6 +39,10 @@

eval_logger = logging.getLogger("lmms-eval")

# HuggingFaceM4/NoCaps contains truncated images in the test split
# Setting this flag lets PIL load them instead of raising an error
ImageFile.LOAD_TRUNCATED_IMAGES = True


@dataclass
class TaskConfig(dict):
@@ -837,7 +842,7 @@ def doc_to_choice(self, doc: Any) -> List[str]:

def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
if self.OUTPUT_TYPE == "loglikelihood":
arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual(doc))
arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split"))
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
arguments = (self.doc_to_target(doc),)
elif self.OUTPUT_TYPE == "multiple_choice":
@@ -846,11 +851,11 @@ def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
arguments = [(ctx, f"{target_delimiter}{cont}") for ctx in choices]
arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for ctx in choices]
else:
# Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]

arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for cont in choices]
kwargs.pop("split")
request_list = [
Instance(
request_type="loglikelihood",
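A rough sketch of the six-slot argument tuple that construct_requests now builds for loglikelihood requests; the type alias and the slot comments are illustrative only, not part of the library.

```python
from typing import Any, Callable, Tuple

# Illustrative shape of the new loglikelihood arguments, slot by slot.
LoglikelihoodArgs = Tuple[
    str,                    # ctx: the formatted context / prompt
    str,                    # continuation: target_delimiter + choice or target
    Callable[[dict], Any],  # doc_to_visual: called lazily by the model
    int,                    # doc_id, taken from kwargs["metadata"][1]
    str,                    # task name (self.config.task), used to index task_dict
    str,                    # dataset split (kwargs["split"])
]
```

Packing the callable instead of pre-decoded images keeps the request objects light; the model only resolves the visuals when it scores the request, as the llava.py change below shows.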
6 changes: 4 additions & 2 deletions lmms_eval/models/llava.py
@@ -145,8 +145,10 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
res = []
pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

for contexts, continuation, visuals in [reg.args for reg in requests]:
for contexts, continuation, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
# encode, pad, and truncate contexts for this batch
visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
visuals = self.flatten(visuals)
if visuals:
image = process_images(visuals, self._image_processor, self._config)
if type(image) is list:
@@ -186,7 +188,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
with torch.inference_mode():
outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True)
loss = outputs["loss"]
loss = torch.exp(loss)
# loss = torch.exp(loss)
logits = outputs["logits"]
greedy_tokens = logits.argmax(dim=-1)
cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq]
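A minimal sketch of how the model side consumes those arguments, with the task_dict layout assumed for illustration. Note that loss = torch.exp(loss) is commented out because the exponentiation now happens later, in the perplexity aggregation above.

```python
# Sketch only: rebuild the visuals from the request args instead of
# receiving pre-decoded images (the task_dict layout is an assumption).
def resolve_visuals(task_dict, args):
    contexts, continuation, doc_to_visual, doc_id, task, split = args
    doc = task_dict[task][split][doc_id]
    visuals = [doc_to_visual(doc)]
    # Flatten a possible list of lists of images, as self.flatten does.
    return [img for group in visuals
            for img in (group if isinstance(group, list) else [group])]
```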
6 changes: 3 additions & 3 deletions lmms_eval/tasks/coco/coco_train.yaml
@@ -40,8 +40,8 @@ metric_list:
- metric: coco_CIDEr
aggregation : !function utils.coco_cider
higher_is_better : true
- metric: coco_SPICE
aggregation : !function utils.coco_spice
higher_is_better : true
#- metric: coco_SPICE
# aggregation : !function utils.coco_spice
# higher_is_better : true
metadata:
- version: 0.0
6 changes: 3 additions & 3 deletions lmms_eval/tasks/coco/coco_val.yaml
@@ -40,8 +40,8 @@ metric_list:
- metric: coco_CIDEr
aggregation : !function utils.coco_cider
higher_is_better : true
- metric: coco_SPICE
aggregation : !function utils.coco_spice
higher_is_better : true
#- metric: coco_SPICE
# aggregation : !function utils.coco_spice
# higher_is_better : true
metadata:
- version: 0.0
30 changes: 26 additions & 4 deletions lmms_eval/tasks/coco/utils.py
@@ -4,9 +4,13 @@
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocotools.coco import COCO

import logging

eval_logger = logging.getLogger("lmms-eval")

dir_name = os.path.dirname(os.path.abspath(__file__))

COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"] # , "SPICE"]


def coco_doc_to_visual(doc):
@@ -71,19 +75,24 @@ def coco_aggregation_result(results, metric):
gts[imgId] = coco_eval.coco.imgToAnns[imgId]
res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]

print("tokenization...")
eval_logger.info("tokenization...")
tokenizer = PTBTokenizer()
gts = tokenizer.tokenize(gts)
res = tokenizer.tokenize(res)

print(f"Computing {metric} scores...")
eval_logger.info(f"Computing {metric} scores...")

score, scores = scorers_dict[metric][0].compute_score(gts, res)
# When the metric is one of the Bleu variants, score will be a list
if type(score) == list:
n = int(metric.split("_")[-1])
score = score[n - 1]

if not os.path.exists("./captions_val2014_alg_results.json"):
eval_logger.info("Storing prediction that can be submitted to the server ...")
with open("./captions_val2014_alg_results.json", "w") as f:
json.dump(stored_results, f, indent=4)

return score


@@ -127,8 +136,21 @@ def coco_test_process_result(doc, result):
Returns:
a dictionary with key: metric name (in this case coco_passthrough), value: metric value
"""
return {"coco_passthrough": {"pred": result}}
question_id = doc["question_id"]
# The question id in our dataset is the image file name itself
image_id = int(question_id.split("_")[-1].split(".")[0])
return {"coco_passthrough": {"pred": result, "image_id": image_id}}


def coco_test_aggregation_result(results):
stored_results = []
for result in results:
stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})

if not os.path.exists("./captions_test2014_alg_results.json"):
eval_logger.info("Storing prediction that can be submitted to the server ...")
with open("./captions_test2014_alg_results.json", "w") as f:
json.dump(stored_results, f, indent=4)

eval_logger.info("Your test result has been stored. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
return -1
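For reference, a small sketch of the submission file that the aggregation above writes; the file name comes from the diff, while the image ids and captions below are invented.

```python
import json

# Illustrative payload in the COCO results format expected by the
# evaluation server; the entries are made up for this example.
stored_results = [
    {"image_id": 1, "caption": "A man riding a motorcycle on a dirt road."},
    {"image_id": 2, "caption": "A woman cutting a large white cake."},
]

with open("./captions_test2014_alg_results.json", "w") as f:
    json.dump(stored_results, f, indent=4)
```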
3 changes: 3 additions & 0 deletions lmms_eval/tasks/nocaps/nocaps.yaml
@@ -0,0 +1,3 @@
group : nocaps
task:
- nocaps_caption
26 changes: 26 additions & 0 deletions lmms_eval/tasks/nocaps/nocaps_test.yaml
@@ -0,0 +1,26 @@
dataset_path: HuggingFaceM4/NoCaps
dataset_kwargs:
token: True
task : "nocaps_test"
group : "nocaps_caption"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.nocaps_doc_to_visual
doc_to_text: !function utils.nocaps_doc_to_text
doc_to_target: "annotations_captions"
generation_kwargs:
until:
- "ASSISTANT:"
max_new_tokens: 1024
temperature: 0
top_p: 0
num_beams: 1
do_sample: false
process_results: !function utils.nocaps_test_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: nocaps_passthrough
aggregation : !function utils.nocaps_test_aggregation_result
higher_is_better : true
metadata:
- version: 0.0
47 changes: 47 additions & 0 deletions lmms_eval/tasks/nocaps/nocaps_val.yaml
@@ -0,0 +1,47 @@
dataset_path: HuggingFaceM4/NoCaps
dataset_kwargs:
token: True
task: "nocaps_val"
group : "nocaps_caption"
test_split: validation
output_type: generate_until
doc_to_visual: !function utils.nocaps_doc_to_visual
doc_to_text: !function utils.nocaps_doc_to_text
doc_to_target: "annotations_captions"
generation_kwargs:
until:
- "ASSISTANT:"
max_new_tokens: 1024
temperature: 0
top_p: 0
num_beams: 1
do_sample: false
process_results: !function utils.nocaps_process_result
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: nocaps_Bleu_4
aggregation : !function utils.nocaps_bleu4
higher_is_better : true
- metric: nocaps_Bleu_3
aggregation : !function utils.nocaps_bleu3
higher_is_better : true
- metric: nocaps_Bleu_2
aggregation : !function utils.nocaps_bleu2
higher_is_better : true
- metric: nocaps_Bleu_1
aggregation : !function utils.nocaps_bleu1
higher_is_better : true
- metric: nocaps_METEOR
aggregation : !function utils.nocaps_meteor
higher_is_better : true
- metric: nocaps_ROUGE_L
aggregation : !function utils.nocaps_rougel
higher_is_better : true
- metric: nocaps_CIDEr
aggregation : !function utils.nocaps_cider
higher_is_better : true
#- metric: nocaps_SPICE
# aggregation : !function utils.nocaps_spice
# higher_is_better : true
metadata:
- version: 0.0
(Remaining changed files not shown.)
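The utils.nocaps_* helpers referenced by the YAML above live in one of the files not shown in this view (presumably lmms_eval/tasks/nocaps/utils.py). A hypothetical sketch of the test-split passthrough pair, assuming it mirrors the COCO helpers earlier in this diff; the image_id field is an assumption about the HuggingFaceM4/NoCaps schema.

```python
# Hypothetical: mirrors coco_test_process_result / coco_test_aggregation_result.
def nocaps_test_process_result(doc, result):
    # The key must match the metric name declared in metric_list.
    return {"nocaps_passthrough": {"pred": result, "image_id": doc["image_id"]}}


def nocaps_test_aggregation_result(results):
    return [{"image_id": r["image_id"], "caption": r["pred"]} for r in results]
```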
