diff --git a/lmms_eval/api/metrics.py b/lmms_eval/api/metrics.py
index a5cc7d68..687c4a65 100644
--- a/lmms_eval/api/metrics.py
+++ b/lmms_eval/api/metrics.py
@@ -6,6 +6,7 @@
 import sklearn.metrics
 import random
 import evaluate
+import torch
 
 from lmms_eval.api.registry import register_metric, register_aggregation
 
@@ -20,11 +21,6 @@ def mean(arr):
     return sum(arr) / len(arr)
 
 
-@register_aggregation("sum")
-def mean(arr):
-    return sum(arr)
-
-
 @register_aggregation("median")
 def median(arr):
     return arr[len(arr) // 2]
@@ -35,6 +31,7 @@ def median(arr):
 @register_aggregation("perplexity")
 def perplexity(items):
     # return math.exp(-mean(items))
+    items = torch.exp(torch.tensor(items)).tolist()
     return sum(items) / len(items)
 
 
diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index f3c03d37..a9a3ddc0 100644
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -9,6 +9,7 @@
 import datasets
 import numpy as np
+from PIL import ImageFile
 
 from typing import Union, List, Any
 from collections.abc import Callable
@@ -38,6 +39,10 @@
 eval_logger = logging.getLogger("lmms-eval")
 
+# HuggingFaceM4/NoCaps contains truncated images in its test split;
+# setting this flag here avoids a PIL error when such images are loaded
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
 
 @dataclass
 class TaskConfig(dict):
@@ -837,7 +842,7 @@ def doc_to_choice(self, doc: Any) -> List[str]:
 
     def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
         if self.OUTPUT_TYPE == "loglikelihood":
-            arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual(doc))
+            arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split"))
         elif self.OUTPUT_TYPE == "loglikelihood_rolling":
             arguments = (self.doc_to_target(doc),)
         elif self.OUTPUT_TYPE == "multiple_choice":
@@ -846,11 +851,11 @@ def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instan
             if self.multiple_input:
                 # If there are multiple inputs, choices are placed in the ctx
                 cont = self.doc_to_target(doc)
-                arguments = [(ctx, f"{target_delimiter}{cont}") for ctx in choices]
+                arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for ctx in choices]
             else:
                 # Otherwise they are placed in the continuation
-                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
-
+                arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for cont in choices]
+            kwargs.pop("split")
         request_list = [
             Instance(
                 request_type="loglikelihood",
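
To make the intent of the metrics.py change above concrete: loglikelihood results now carry raw per-item losses, and the registered "perplexity" aggregation exponentiates them before averaging (the torch.exp that previously lived in the model code, see the llava.py hunk below). A minimal sketch with made-up values:

    import torch

    losses = [2.31, 1.87, 2.05]                          # hypothetical per-document losses
    per_item_ppl = torch.exp(torch.tensor(losses)).tolist()
    aggregated = sum(per_item_ppl) / len(per_item_ppl)   # what perplexity() now returns
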
diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py
index 0e11bab0..2033f093 100644
--- a/lmms_eval/models/llava.py
+++ b/lmms_eval/models/llava.py
@@ -145,8 +145,10 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
         res = []
         pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
 
-        for contexts, continuation, visuals in [reg.args for reg in requests]:
+        for contexts, continuation, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
             # encode, pad, and truncate contexts for this batch
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
+            visuals = self.flatten(visuals)
             if visuals:
                 image = process_images(visuals, self._image_processor, self._config)
                 if type(image) is list:
@@ -186,7 +188,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
             with torch.inference_mode():
                 outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True)
             loss = outputs["loss"]
-            loss = torch.exp(loss)
+            # loss = torch.exp(loss)
             logits = outputs["logits"]
             greedy_tokens = logits.argmax(dim=-1)
             cont_toks = input_ids[:, contxt_id.shape[1] :]  # [1, seq]
diff --git a/lmms_eval/tasks/coco/coco_train.yaml b/lmms_eval/tasks/coco/coco_train.yaml
index 98419369..52be75f7 100644
--- a/lmms_eval/tasks/coco/coco_train.yaml
+++ b/lmms_eval/tasks/coco/coco_train.yaml
@@ -40,8 +40,8 @@ metric_list:
   - metric: coco_CIDEr
     aggregation : !function utils.coco_cider
     higher_is_better : true
-  - metric: coco_SPICE
-    aggregation : !function utils.coco_spice
-    higher_is_better : true
+  #- metric: coco_SPICE
+  #  aggregation : !function utils.coco_spice
+  #  higher_is_better : true
 metadata:
   - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/coco_val.yaml b/lmms_eval/tasks/coco/coco_val.yaml
index 871937ff..264052ba 100644
--- a/lmms_eval/tasks/coco/coco_val.yaml
+++ b/lmms_eval/tasks/coco/coco_val.yaml
@@ -40,8 +40,8 @@ metric_list:
   - metric: coco_CIDEr
     aggregation : !function utils.coco_cider
     higher_is_better : true
-  - metric: coco_SPICE
-    aggregation : !function utils.coco_spice
-    higher_is_better : true
+  #- metric: coco_SPICE
+  #  aggregation : !function utils.coco_spice
+  #  higher_is_better : true
 metadata:
   - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/utils.py b/lmms_eval/tasks/coco/utils.py
index 5a219f0a..e82c3590 100644
--- a/lmms_eval/tasks/coco/utils.py
+++ b/lmms_eval/tasks/coco/utils.py
@@ -4,9 +4,13 @@
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 from pycocotools.coco import COCO
 
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
+
 dir_name = os.path.dirname(os.path.abspath(__file__))
 
-COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
+COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
 
 
 def coco_doc_to_visual(doc):
@@ -71,12 +75,12 @@ def coco_aggregation_result(results, metric):
         gts[imgId] = coco_eval.coco.imgToAnns[imgId]
         res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]
 
-    print("tokenization...")
+    eval_logger.info("tokenization...")
     tokenizer = PTBTokenizer()
     gts = tokenizer.tokenize(gts)
     res = tokenizer.tokenize(res)
 
-    print(f"Computing {metric} scores...")
+    eval_logger.info(f"Computing {metric} scores...")
     score, scores = scorers_dict[metric][0].compute_score(gts, res)
     # When metric is one of the Bleu, score will be a list
@@ -84,6 +88,11 @@
         n = int(metric.split("_")[-1])
         score = score[n - 1]
 
+    if not os.path.exists("./captions_val2014_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_val2014_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
     return score
 
 
@@ -127,8 +136,21 @@ def coco_test_process_result(doc, result):
     Returns:
         a dictionary with key: metric name (in this case coco_passthrough), value: metric value
     """
-    return {"coco_passthrough": {"pred": result}}
+    question_id = doc["question_id"]
+    # The question id in our dataset is the image file name itself
+    image_id = int(question_id.split("_")[-1].split(".")[0])
+    return {"coco_passthrough": {"pred": result, "image_id": image_id}}
 
 
 def coco_test_aggregation_result(results):
+    stored_results = []
+    for result in results:
+        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
+
+    if not os.path.exists("./captions_test2014_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_test2014_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    eval_logger.info("Your test result has been stored. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
     return -1
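
For reference, the *_alg_results.json files written above are plain JSON lists in the COCO caption-server submission format. A minimal sketch of the structure that ends up on disk (image ids and captions are made up):

    import json

    stored_results = [
        {"image_id": 391895, "caption": "a man riding a motorcycle on a dirt road"},
        {"image_id": 522418, "caption": "a woman cutting a large cake"},
    ]
    print(json.dumps(stored_results, indent=4))  # same structure that gets written to disk
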
diff --git a/lmms_eval/tasks/nocaps/nocaps.yaml b/lmms_eval/tasks/nocaps/nocaps.yaml
new file mode 100644
index 00000000..f85c250a
--- /dev/null
+++ b/lmms_eval/tasks/nocaps/nocaps.yaml
@@ -0,0 +1,3 @@
+group : nocaps
+task:
+  - nocaps_caption
\ No newline at end of file
diff --git a/lmms_eval/tasks/nocaps/nocaps_test.yaml b/lmms_eval/tasks/nocaps/nocaps_test.yaml
new file mode 100644
index 00000000..ec2de9fb
--- /dev/null
+++ b/lmms_eval/tasks/nocaps/nocaps_test.yaml
@@ -0,0 +1,26 @@
+dataset_path: HuggingFaceM4/NoCaps
+dataset_kwargs:
+  token: True
+task : "nocaps_test"
+group : "nocaps_caption"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.nocaps_doc_to_visual
+doc_to_text: !function utils.nocaps_doc_to_text
+doc_to_target: "annotations_captions"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.nocaps_test_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: nocaps_passthrough
+    aggregation : !function utils.nocaps_test_aggregation_result
+    higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/nocaps/nocaps_val.yaml b/lmms_eval/tasks/nocaps/nocaps_val.yaml
new file mode 100644
index 00000000..3308eebd
--- /dev/null
+++ b/lmms_eval/tasks/nocaps/nocaps_val.yaml
@@ -0,0 +1,47 @@
+dataset_path: HuggingFaceM4/NoCaps
+dataset_kwargs:
+  token: True
+task: "nocaps_val"
+group : "nocaps_caption"
+test_split: validation
+output_type: generate_until
+doc_to_visual: !function utils.nocaps_doc_to_visual
+doc_to_text: !function utils.nocaps_doc_to_text
+doc_to_target: "annotations_captions"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.nocaps_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: nocaps_Bleu_4
+    aggregation : !function utils.nocaps_bleu4
+    higher_is_better : true
+  - metric: nocaps_Bleu_3
+    aggregation : !function utils.nocaps_bleu3
+    higher_is_better : true
+  - metric: nocaps_Bleu_2
+    aggregation : !function utils.nocaps_bleu2
+    higher_is_better : true
+  - metric: nocaps_Bleu_1
+    aggregation : !function utils.nocaps_bleu1
+    higher_is_better : true
+  - metric: nocaps_METEOR
+    aggregation : !function utils.nocaps_meteor
+    higher_is_better : true
+  - metric: nocaps_ROUGE_L
+    aggregation : !function utils.nocaps_rougel
+    higher_is_better : true
+  - metric: nocaps_CIDEr
+    aggregation : !function utils.nocaps_cider
+    higher_is_better : true
+  #- metric: nocaps_SPICE
+  #  aggregation : !function utils.nocaps_spice
+  #  higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/nocaps/utils.py b/lmms_eval/tasks/nocaps/utils.py
new file mode 100644
index 00000000..4c996e27
--- /dev/null
+++ b/lmms_eval/tasks/nocaps/utils.py
@@ -0,0 +1,151 @@
+import os
+import json
+from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
+from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+from pycocotools.coco import COCO
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
+
+dir_name = os.path.dirname(os.path.abspath(__file__))
+
+NOCAPS_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+
+def nocaps_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+
+def nocaps_doc_to_text(doc):
+    question = "Please carefully observe the image and come up with a caption for the image"
+    return f"{question}\nAnswer the question using a single word or phrase."
+
+
+def nocaps_process_result(doc, result):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name, value: metric value
+    """
+    pred = result[0]
+    # NoCaps provides the numeric image id directly
+    image_id = doc["image_id"]
+
+    data_dict = {"answer": doc["annotations_captions"], "pred": pred, "image_id": image_id}
+
+    return {f"nocaps_{metric}": data_dict for metric in NOCAPS_METRICS}
+
+
+def nocaps_aggregation_result(results, metric):
+    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
+    scorers_dict = {s[1]: s for s in scorers}
+
+    stored_results = []
+    # For the COCO eval tools to successfully create an index,
+    # the dataset dict needs at least two keys:
+    # 'annotations' and 'images'.
+    # 'annotations' exactly reproduces the original annotations,
+    # while 'images' only needs the image ids contained in the file names.
+    dataset = {"annotations": [], "images": []}
+    idx = 0
+    for result in results:
+        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
+        for a in result["answer"]:
+            dataset["annotations"].append({"image_id": int(result["image_id"]), "caption": a, "id": idx})
+            idx += 1
+        dataset["images"].append({"id": result["image_id"]})
+
+    coco = COCO()
+    # Manually create index here
+    coco.dataset = dataset
+    coco.createIndex()
+
+    nocaps_result = coco.loadRes(stored_results)
+    nocaps_eval = COCOEvalCap(coco, nocaps_result)
+
+    imgIds = nocaps_eval.params["image_id"]
+    gts = {}
+    res = {}
+    for imgId in imgIds:
+        gts[imgId] = nocaps_eval.coco.imgToAnns[imgId]
+        res[imgId] = nocaps_eval.cocoRes.imgToAnns[imgId]
+
+    eval_logger.info("tokenization...")
+    tokenizer = PTBTokenizer()
+    gts = tokenizer.tokenize(gts)
+    res = tokenizer.tokenize(res)
+
+    eval_logger.info(f"Computing {metric} scores...")
+
+    score, scores = scorers_dict[metric][0].compute_score(gts, res)
+    # When metric is one of the Bleu, score will be a list
+    if type(score) == list:
+        n = int(metric.split("_")[-1])
+        score = score[n - 1]
+
+    if not os.path.exists("./captions_nocaps_val_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_nocaps_val_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    return score
+
+
+def nocaps_bleu4(results):
+    return nocaps_aggregation_result(results, "Bleu_4")
+
+
+def nocaps_bleu3(results):
+    return nocaps_aggregation_result(results, "Bleu_3")
+
+
+def nocaps_bleu2(results):
+    return nocaps_aggregation_result(results, "Bleu_2")
+
+
+def nocaps_bleu1(results):
+    return nocaps_aggregation_result(results, "Bleu_1")
+
+
+def nocaps_meteor(results):
+    return nocaps_aggregation_result(results, "METEOR")
+
+
+def nocaps_rougel(results):
+    return nocaps_aggregation_result(results, "ROUGE_L")
+
+
+def nocaps_cider(results):
+    return nocaps_aggregation_result(results, "CIDEr")
+
+
+def nocaps_spice(results):
+    return nocaps_aggregation_result(results, "SPICE")
+
+
+def nocaps_test_process_result(doc, result):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name (in this case nocaps_passthrough), value: metric value
+    """
+    return {"nocaps_passthrough": {"pred": result, "image_id": doc["image_id"]}}
+
+
+def nocaps_test_aggregation_result(results):
+    stored_results = []
+    for result in results:
+        stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]})
+
+    if not os.path.exists("./captions_nocaps_test_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_nocaps_test_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    eval_logger.info("Your test result has been stored. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
+    return -1
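
The in-memory COCO index trick used by nocaps_aggregation_result above (and by the coco and textcaps variants) can be isolated as the following sketch; the ids and captions are made up:

    from pycocotools.coco import COCO

    dataset = {
        "annotations": [
            {"image_id": 1, "caption": "a dog rides a skateboard", "id": 0},
            {"image_id": 1, "caption": "a brown dog on a skateboard", "id": 1},
        ],
        "images": [{"id": 1}],
    }
    coco = COCO()            # no annotation file on disk
    coco.dataset = dataset   # attach the dict directly
    coco.createIndex()       # builds the imgToAnns / imgs indices

    predictions = [{"image_id": 1, "caption": "a dog riding a skateboard"}]
    coco_res = coco.loadRes(predictions)   # loadRes accepts an in-memory list, as used above
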
diff --git a/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml b/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml
index 9f4d3213..82fb9914 100644
--- a/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml
+++ b/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcoco+/_default_template_seg_yaml b/lmms_eval/tasks/refcoco+/_default_template_seg_yaml
index 230d4a33..2c38d513 100644
--- a/lmms_eval/tasks/refcoco+/_default_template_seg_yaml
+++ b/lmms_eval/tasks/refcoco+/_default_template_seg_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcoco+/utils.py b/lmms_eval/tasks/refcoco+/utils.py
index 537eec2f..fbec626c 100644
--- a/lmms_eval/tasks/refcoco+/utils.py
+++ b/lmms_eval/tasks/refcoco+/utils.py
@@ -3,7 +3,11 @@
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 from pycocotools.coco import COCO
 
-COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
+COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
 
 
 def refcoco_bbox_doc_to_visual(doc):
@@ -79,12 +83,12 @@ def refcoco_aggregation_result(results, metric):
         gts[imgId] = coco_eval.coco.imgToAnns[imgId]
         res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]
 
-    print("tokenization...")
+    eval_logger.info("tokenization...")
     tokenizer = PTBTokenizer()
     gts = tokenizer.tokenize(gts)
     res = tokenizer.tokenize(res)
 
-    print(f"Computing {metric} scores...")
+    eval_logger.info(f"Computing {metric} scores...")
     score, scores = scorers_dict[metric][0].compute_score(gts, res)
     # coco_eval.setEval(score, metric)
diff --git a/lmms_eval/tasks/refcoco/_default_template_bbox_yaml b/lmms_eval/tasks/refcoco/_default_template_bbox_yaml
index d45ffabd..b5cc6b0f 100644
--- a/lmms_eval/tasks/refcoco/_default_template_bbox_yaml
+++ b/lmms_eval/tasks/refcoco/_default_template_bbox_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcoco/_default_template_seg_yaml b/lmms_eval/tasks/refcoco/_default_template_seg_yaml
index 9ee5558d..85456541 100644
--- a/lmms_eval/tasks/refcoco/_default_template_seg_yaml
+++ b/lmms_eval/tasks/refcoco/_default_template_seg_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcoco/utils.py b/lmms_eval/tasks/refcoco/utils.py
index 01bcace4..ed2b1344 100644
--- a/lmms_eval/tasks/refcoco/utils.py
+++ b/lmms_eval/tasks/refcoco/utils.py
@@ -3,7 +3,11 @@
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 from pycocotools.coco import COCO
 
-COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
+COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
 
 
 def refcoco_bbox_doc_to_visual(doc):
@@ -79,12 +83,12 @@ def refcoco_aggregation_result(results, metric):
         gts[imgId] = coco_eval.coco.imgToAnns[imgId]
         res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]
 
-    print("tokenization...")
+    eval_logger.info("tokenization...")
     tokenizer = PTBTokenizer()
     gts = tokenizer.tokenize(gts)
     res = tokenizer.tokenize(res)
 
-    print(f"Computing {metric} scores...")
+    eval_logger.info(f"Computing {metric} scores...")
     score, scores = scorers_dict[metric][0].compute_score(gts, res)
     # coco_eval.setEval(score, metric)
diff --git a/lmms_eval/tasks/refcocog/_default_template_bbox_yaml b/lmms_eval/tasks/refcocog/_default_template_bbox_yaml
index a88d7b65..9abd94f6 100644
--- a/lmms_eval/tasks/refcocog/_default_template_bbox_yaml
+++ b/lmms_eval/tasks/refcocog/_default_template_bbox_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcocog/_default_template_seg_yaml b/lmms_eval/tasks/refcocog/_default_template_seg_yaml
index 9ee5558d..85456541 100644
--- a/lmms_eval/tasks/refcocog/_default_template_seg_yaml
+++ b/lmms_eval/tasks/refcocog/_default_template_seg_yaml
@@ -29,8 +29,8 @@ metric_list:
   - metric: refcoco_CIDEr
     aggregation : !function utils.refcoco_cider
     higher_is_better : true
-  - metric: refcoco_SPICE
-    aggregation : !function utils.refcoco_spice
-    higher_is_better : true
+  #- metric: refcoco_SPICE
+  #  aggregation : !function utils.refcoco_spice
+  #  higher_is_better : true
 metadata:
   version: '0.0'
\ No newline at end of file
diff --git a/lmms_eval/tasks/refcocog/utils.py b/lmms_eval/tasks/refcocog/utils.py
index 537eec2f..fbec626c 100644
--- a/lmms_eval/tasks/refcocog/utils.py
+++ b/lmms_eval/tasks/refcocog/utils.py
@@ -3,7 +3,11 @@
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 from pycocotools.coco import COCO
 
-COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr", "SPICE"]
+COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
 
 
 def refcoco_bbox_doc_to_visual(doc):
@@ -79,12 +83,12 @@ def refcoco_aggregation_result(results, metric):
         gts[imgId] = coco_eval.coco.imgToAnns[imgId]
         res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]
 
-    print("tokenization...")
+    eval_logger.info("tokenization...")
     tokenizer = PTBTokenizer()
     gts = tokenizer.tokenize(gts)
     res = tokenizer.tokenize(res)
 
-    print(f"Computing {metric} scores...")
+    eval_logger.info(f"Computing {metric} scores...")
     score, scores = scorers_dict[metric][0].compute_score(gts, res)
     # coco_eval.setEval(score, metric)
diff --git a/lmms_eval/tasks/seedbench/seedbench.yaml b/lmms_eval/tasks/seedbench/seedbench.yaml
new file mode 100644
index 00000000..a573f54b
--- /dev/null
+++ b/lmms_eval/tasks/seedbench/seedbench.yaml
@@ -0,0 +1,27 @@
+dataset_path: lmms-lab/SEED-Bench
+dataset_kwargs:
+  token: True
+task: "seedbench"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.seed_doc_to_visual
+doc_to_text: !function utils.seed_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+# The return value of process_results will be used by metrics
+process_results: !function utils.seed_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: seed_image
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_video
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_all
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/seedbench/seedbench_ppl.yaml b/lmms_eval/tasks/seedbench/seedbench_ppl.yaml
new file mode 100644
index 00000000..ee522b5c
--- /dev/null
+++ b/lmms_eval/tasks/seedbench/seedbench_ppl.yaml
@@ -0,0 +1,15 @@
+dataset_path: lmms-lab/SEED-Bench
+dataset_kwargs:
+  token: True
+task: "seedbench_ppl"
+test_split: test
+output_type: multiple_choice
+doc_to_visual: !function utils.seed_doc_to_visual
+doc_to_text: !function utils.seed_doc_to_text
+doc_to_choice : !function utils.seed_doc_to_choice
+doc_to_target: !function utils.seed_doc_to_mc_target
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: acc
+metadata:
+  - version: 0.0
\ No newline at end of file
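
As a rough illustration of what the seedbench_ppl config above evaluates (a sketch with made-up fields and scores, not the harness's exact scoring code): each choice returned by seed_doc_to_choice becomes one loglikelihood continuation, and accuracy checks whether the highest-scoring choice matches seed_doc_to_mc_target.

    doc = {
        "question": "What is the man holding?",
        "choice_a": "a racket", "choice_b": "a bat",
        "choice_c": "a phone", "choice_d": "a cup",
        "answer": "B",
    }
    choices = [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]]  # seed_doc_to_choice(doc)
    target = doc[{"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"}[doc["answer"]]]  # seed_doc_to_mc_target(doc)

    loglikelihoods = [-4.2, -1.3, -5.0, -3.7]   # hypothetical per-choice scores from the model
    pred = choices[loglikelihoods.index(max(loglikelihoods))]
    correct = pred == target                    # feeds the `acc` metric
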
diff --git a/lmms_eval/tasks/seedbench/utils.py b/lmms_eval/tasks/seedbench/utils.py
new file mode 100644
index 00000000..37943f04
--- /dev/null
+++ b/lmms_eval/tasks/seedbench/utils.py
@@ -0,0 +1,53 @@
+import json
+
+
+def seed_doc_to_visual(doc):
+    return [image.convert("RGB") for image in doc["image"]]
+
+
+def seed_doc_to_text(doc):
+    question = doc["question"]
+    question += "\n" + f"A.{doc['choice_a']}\n"
+    question += "\n" + f"B.{doc['choice_b']}\n"
+    question += "\n" + f"C.{doc['choice_c']}\n"
+    question += "\n" + f"D.{doc['choice_d']}"
+    return f"{question}\nAnswer with the option's letter from the given choices directly."
+
+
+def seed_process_result(doc, result):
+    pred = result[0].strip()
+    answer = doc["answer"]
+    data_type = doc["data_type"]
+
+    return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}}
+
+
+def seed_aggregation_result(results):
+    total_count = 0
+    total_correct = 0
+    for result in results:
+        if result["pred"] == result["answer"]:
+            total_correct += 1
+        total_count += 1
+    return total_correct / total_count
+
+
+def seed_aggregation_result_all(results):
+    score = seed_aggregation_result(results)
+    stored_results = []
+    for result in results:
+        stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]})
+    with open("./seed_submission.json", "w") as f:
+        json.dump(stored_results, f, indent=4)
+        print("Storing files for seed_submission ...")
+
+    return score
+
+
+def seed_doc_to_choice(doc):
+    return [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]]
+
+
+def seed_doc_to_mc_target(doc):
+    answer2choice = {"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"}
+    return doc[answer2choice[doc["answer"]]]
diff --git a/lmms_eval/tasks/textcaps/textcaps.yaml b/lmms_eval/tasks/textcaps/textcaps.yaml
new file mode 100644
index 00000000..25f1ee90
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/textcaps.yaml
@@ -0,0 +1,3 @@
+group : textcaps
+task:
+  - textcaps_caption
\ No newline at end of file
diff --git a/lmms_eval/tasks/textcaps/textcaps_test.yaml b/lmms_eval/tasks/textcaps/textcaps_test.yaml
new file mode 100644
index 00000000..377eb442
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/textcaps_test.yaml
@@ -0,0 +1,26 @@
+dataset_path: lmms-lab/TextCaps
+dataset_kwargs:
+  token: True
+task : "textcaps_test"
+group : "textcaps_caption"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.textcaps_doc_to_visual
+doc_to_text: !function utils.textcaps_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.textcaps_test_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: textcaps_passthrough
+    aggregation : !function utils.textcaps_test_aggregation_result
+    higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/textcaps/textcaps_train.yaml b/lmms_eval/tasks/textcaps/textcaps_train.yaml
new file mode 100644
index 00000000..6a7db24b
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/textcaps_train.yaml
@@ -0,0 +1,47 @@
+dataset_path: lmms-lab/TextCaps
+dataset_kwargs:
+  token: True
+task : "textcaps_train"
+group : "textcaps_caption"
+test_split: train
+output_type: generate_until
+doc_to_visual: !function utils.textcaps_doc_to_visual
+doc_to_text: !function utils.textcaps_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.textcaps_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: textcaps_Bleu_4
+    aggregation : !function utils.textcaps_bleu4
+    higher_is_better : true
+  - metric: textcaps_Bleu_3
+    aggregation : !function utils.textcaps_bleu3
+    higher_is_better : true
+  - metric: textcaps_Bleu_2
+    aggregation : !function utils.textcaps_bleu2
+    higher_is_better : true
+  - metric: textcaps_Bleu_1
+    aggregation : !function utils.textcaps_bleu1
+    higher_is_better : true
+  - metric: textcaps_METEOR
+    aggregation : !function utils.textcaps_meteor
+    higher_is_better : true
+  - metric: textcaps_ROUGE_L
+    aggregation : !function utils.textcaps_rougel
+    higher_is_better : true
+  - metric: textcaps_CIDEr
+    aggregation : !function utils.textcaps_cider
+    higher_is_better : true
+  #- metric: textcaps_SPICE
+  #  aggregation : !function utils.textcaps_spice
+  #  higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/textcaps/textcaps_val.yaml b/lmms_eval/tasks/textcaps/textcaps_val.yaml
new file mode 100644
index 00000000..9daf613f
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/textcaps_val.yaml
@@ -0,0 +1,47 @@
+dataset_path: lmms-lab/TextCaps
+dataset_kwargs:
+  token: True
+task: "textcaps_val"
+group : "textcaps_caption"
+test_split: val
+output_type: generate_until
+doc_to_visual: !function utils.textcaps_doc_to_visual
+doc_to_text: !function utils.textcaps_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.textcaps_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: textcaps_Bleu_4
+    aggregation : !function utils.textcaps_bleu4
+    higher_is_better : true
+  - metric: textcaps_Bleu_3
+    aggregation : !function utils.textcaps_bleu3
+    higher_is_better : true
+  - metric: textcaps_Bleu_2
+    aggregation : !function utils.textcaps_bleu2
+    higher_is_better : true
+  - metric: textcaps_Bleu_1
+    aggregation : !function utils.textcaps_bleu1
+    higher_is_better : true
+  - metric: textcaps_METEOR
+    aggregation : !function utils.textcaps_meteor
+    higher_is_better : true
+  - metric: textcaps_ROUGE_L
+    aggregation : !function utils.textcaps_rougel
+    higher_is_better : true
+  - metric: textcaps_CIDEr
+    aggregation : !function utils.textcaps_cider
+    higher_is_better : true
+  #- metric: textcaps_SPICE
+  #  aggregation : !function utils.textcaps_spice
+  #  higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/textcaps/utils.py b/lmms_eval/tasks/textcaps/utils.py
new file mode 100644
index 00000000..f34d9660
--- /dev/null
+++ b/lmms_eval/tasks/textcaps/utils.py
@@ -0,0 +1,149 @@
+import os
+import json
+from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
+from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+from pycocotools.coco import COCO
+
+import logging
+
+eval_logger = logging.getLogger("lmms-eval")
+
+dir_name = os.path.dirname(os.path.abspath(__file__))
+
+TEXTCAPS_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
+
+
+def textcaps_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+
+def textcaps_doc_to_text(doc):
+    question = doc["question"]
+    return f"{question}\nAnswer the question using a single word or phrase."
+
+
+def textcaps_process_result(doc, result):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name, value: metric value
+    """
+    pred = result[0] if len(result) > 0 else ""
+
+    data_dict = {"answer": doc["caption_str"], "pred": pred, "image_id": doc["image_id"]}
+
+    return {f"textcaps_{metric}": data_dict for metric in TEXTCAPS_METRICS}
+
+
+def textcaps_aggregation_result(results, metric):
+    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
+    scorers_dict = {s[1]: s for s in scorers}
+
+    stored_results = []
+    # For the COCO eval tools to successfully create an index,
+    # the dataset dict needs at least two keys:
+    # 'annotations' and 'images'.
+    # 'annotations' exactly reproduces the original annotations,
+    # while 'images' only needs the image ids contained in the file names.
+    dataset = {"annotations": [], "images": []}
+    idx = 0
+    for result in results:
+        stored_results.append({"image_id": result["image_id"], "caption": result["pred"]})
+        for a in result["answer"]:
+            dataset["annotations"].append({"image_id": result["image_id"], "caption": a, "id": idx})
+            idx += 1
+        dataset["images"].append({"id": result["image_id"]})
+
+    coco = COCO()
+    # Manually create index here
+    coco.dataset = dataset
+    coco.createIndex()
+
+    textcaps_result = coco.loadRes(stored_results)
+    textcaps_eval = COCOEvalCap(coco, textcaps_result)
+
+    imgIds = textcaps_eval.params["image_id"]
+    gts = {}
+    res = {}
+    for imgId in imgIds:
+        gts[imgId] = textcaps_eval.coco.imgToAnns[imgId]
+        res[imgId] = textcaps_eval.cocoRes.imgToAnns[imgId]
+
+    eval_logger.info("tokenization...")
+    tokenizer = PTBTokenizer()
+    gts = tokenizer.tokenize(gts)
+    res = tokenizer.tokenize(res)
+
+    eval_logger.info(f"Computing {metric} scores...")
+
+    score, scores = scorers_dict[metric][0].compute_score(gts, res)
+    # When metric is one of the Bleu, score will be a list
+    if type(score) == list:
+        n = int(metric.split("_")[-1])
+        score = score[n - 1]
+
+    if not os.path.exists("./captions_val2014_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_val2014_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    return score
+
+
+def textcaps_bleu4(results):
+    return textcaps_aggregation_result(results, "Bleu_4")
+
+
+def textcaps_bleu3(results):
+    return textcaps_aggregation_result(results, "Bleu_3")
+
+
+def textcaps_bleu2(results):
+    return textcaps_aggregation_result(results, "Bleu_2")
+
+
+def textcaps_bleu1(results):
+    return textcaps_aggregation_result(results, "Bleu_1")
+
+
+def textcaps_meteor(results):
+    return textcaps_aggregation_result(results, "METEOR")
+
+
+def textcaps_rougel(results):
+    return textcaps_aggregation_result(results, "ROUGE_L")
+
+
+def textcaps_cider(results):
+    return textcaps_aggregation_result(results, "CIDEr")
+
+
+def textcaps_spice(results):
+    return textcaps_aggregation_result(results, "SPICE")
+
+
+def textcaps_test_process_result(doc, result):
+    """
+    Args:
+        doc: an instance of the eval dataset
+        results: [pred]
+    Returns:
+        a dictionary with key: metric name (in this case textcaps_passthrough), value: metric value
+    """
+    return {"textcaps_passthrough": {"pred": result, "image_id": doc["image_id"]}}
+
+
+def textcaps_test_aggregation_result(results):
+    stored_results = []
+    for result in results:
+        stored_results.append({"image_id": result["image_id"], "caption": result["pred"]})
+
+    if not os.path.exists("./captions_test2014_alg_results.json"):
+        eval_logger.info("Storing prediction that can be submitted to the server ...")
+        with open("./captions_test2014_alg_results.json", "w") as f:
+            json.dump(stored_results, f, indent=4)
+
+    eval_logger.info("Your test result has been stored. Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.")
+    return -1