From 2571960654160942632b939c0f3582e5bd4d4ebd Mon Sep 17 00:00:00 2001
From: Fanyi Pu
Date: Wed, 27 Mar 2024 23:54:25 +0800
Subject: [PATCH 01/38] few_shot dataset loading

---
 lmms_eval/api/task.py | 19 +++++++++++++++----
 lmms_eval/tasks/flickr30k/flickr30k_test.yaml | 9 ++++++++-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index 3262937c..aa6edf9a 100644
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -64,7 +64,7 @@ class TaskConfig(dict):
     training_split: str = None
     validation_split: str = None
     test_split: str = None
-    fewshot_split: str = None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+    # fewshot_split: str = None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
     process_docs: Callable = None
@@ -550,8 +550,12 @@ def __init__(self, model_name) -> None: # TODO no super() call here
                 self._filters.append(filter_pipeline)
         else:
             self._filters = [build_filter_ensemble("none", [["take_first", None]])]
+        if self.config.task == "flickr30k_test":
+            pass # TODO: for test, will delete later
         if self.config.fewshot_config is not None:
-            self.sampler = samplers.get_sampler(self.config.fewshot_config.get("sampler", "default") if self.config.fewshot_config else "default")(list(self.fewshot_docs()), self, rnd=random.Random(1234))
+            random_seed = self.config.fewshot_config.get("random_seed", 1234)
+            sampler_function = samplers.get_sampler(self.config.fewshot_config.get("sampler", "default") if self.config.fewshot_config else "default")
+            self.sampler = sampler_function(list(self.fewshot_docs()), self, rnd=random.Random(random_seed))
 
         if self.has_test_docs():
             self.task_docs = self.test_docs()
@@ -742,8 +746,15 @@ def test_docs(self) -> datasets.Dataset:
         return self.dataset[self.config.test_split]
 
     def fewshot_docs(self):
-        if self.config.fewshot_split is not None:
-            return self.dataset[self.config.fewshot_split]
+        if "fewshot_dataset" in self.config.fewshot_config:
+            fewshot_dataset_config = self.config.fewshot_config["fewshot_dataset"]
+            return datasets.load_dataset(
+                path=fewshot_dataset_config["dataset_path"],
+                name=fewshot_dataset_config.get("dataset_name", None),
+                split=fewshot_dataset_config["fewshot_split"],
+                download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+                **fewshot_dataset_config["dataset_kwargs"] if "dataset_kwargs" in fewshot_dataset_config else {},
+            )
         else:
             if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
                 eval_logger.warning(f"Task '{self.config.task}': " "num_fewshot > 0 but fewshot_split is None. " "using preconfigured rule.")
diff --git a/lmms_eval/tasks/flickr30k/flickr30k_test.yaml b/lmms_eval/tasks/flickr30k/flickr30k_test.yaml
index 737d9ff4..776eae39 100644
--- a/lmms_eval/tasks/flickr30k/flickr30k_test.yaml
+++ b/lmms_eval/tasks/flickr30k/flickr30k_test.yaml
@@ -1,7 +1,7 @@
 dataset_path: lmms-lab/flickr30k
 dataset_kwargs:
   token: True
-task : "flickr30k_test"
+task: "flickr30k_test"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function utils.flickr_doc_to_visual
@@ -40,5 +40,12 @@ metric_list:
 #- metric: flickr_SPICE
 #  aggregation : !function utils.flickr_spice
 #  higher_is_better : true
+fewshot_config:
+  fewshot_sampler: default
+  fewshot_dataset:
+    dataset_path: lmms-lab/flickr30k
+    # dataset_name:
+    split: train
+    # random_seed: 1234
 metadata:
   - version: 0.0
\ No newline at end of file

From 5dca4e1da15d24908a76cb296259a1c965d8ef93 Mon Sep 17 00:00:00 2001
From: Fanyi Pu
Date: Thu, 28 Mar 2024 00:58:56 +0800
Subject: [PATCH 02/38] Fix fewshot_config sampler constructor error

---
 lmms_eval/api/task.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index aa6edf9a..9f70056f 100644
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -553,9 +553,12 @@ def __init__(self, model_name) -> None: # TODO no super() call here
         if self.config.task == "flickr30k_test":
             pass # TODO: for test, will delete later
         if self.config.fewshot_config is not None:
-            random_seed = self.config.fewshot_config.get("random_seed", 1234)
-            sampler_function = samplers.get_sampler(self.config.fewshot_config.get("sampler", "default") if self.config.fewshot_config else "default")
-            self.sampler = sampler_function(list(self.fewshot_docs()), self, rnd=random.Random(random_seed))
+            try:
+                random_seed = self.config.fewshot_config.get("random_seed", 1234)
+                sampler_constructor = samplers.get_sampler(self.config.fewshot_config.get("sampler", "default") if self.config.fewshot_config else "default")
+                self.sampler = sampler_constructor(list(self.fewshot_docs()), self, rnd=random.Random(random_seed))
+            except Exception as e:
+                eval_logger.error(f"Error in fewshot_config: {e}")
 
         if self.has_test_docs():
             self.task_docs = self.test_docs()

From 5efc4cb4c4e95b7f5d845b125f0207d5cad1bf25 Mon Sep 17 00:00:00 2001
From: Fanyi Pu
Date: Thu, 28 Mar 2024 01:58:44 +0800
Subject: [PATCH 03/38] few shot dataset lazy load

---
 lmms_eval/api/samplers.py | 19 +++++++++++++++++++
 lmms_eval/api/task.py | 26 ++++++++++++++++----------
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py
index f77065e8..e0688851 100644
--- a/lmms_eval/api/samplers.py
+++ b/lmms_eval/api/samplers.py
@@ -1,3 +1,22 @@
+import datasets
+
+
+class FewShotDataset(object):
+    def __init__(self, dataset=None, *, dataset_path: str = None, dataset_name: str = None, split: str = None, dataset_kwargs: dict = None):
+        if dataset is not None and (dataset_path is not None or dataset_name is not None or split is not None or dataset_kwargs is not None):
+            raise ValueError("Cannot provide both `dataset` and other dataset arguments!")
+        self.dataset_path = dataset_path
+        self.dataset_name = dataset_name
+        self.split = split
+        self.dataset = dataset
+        self.dataset_kwargs = dataset_kwargs if dataset_kwargs is not None else {}
+
+    def get_dataset(self):
+        if self.dataset is None:
+            self.dataset = datasets.load_dataset(path=self.dataset_path, name=self.dataset_name, split=self.split,
download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, **self.dataset_kwargs) + return self.dataset + + class ContextSampler: def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None: self.rnd = rnd diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 9f70056f..4d03f320 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -22,6 +22,7 @@ from lmms_eval import utils from lmms_eval.api import samplers from lmms_eval.api.instance import Instance +from lmms_eval.api.samplers import FewShotDataset from lmms_eval.filters import build_filter_ensemble from lmms_eval.api.registry import ( @@ -305,13 +306,13 @@ def fewshot_docs(self): A iterable of any object, that doc_to_text can handle """ if self.has_training_docs(): - return self.training_docs() + return FewShotDataset(self.training_docs()) elif self.has_validation_docs(): - return self.validation_docs() + return FewShotDataset(self.validation_docs()) else: if self.config.num_fewshot is not None: eval_logger.warning("has_training_docs and has_validation_docs are False" ", using test_docs as fewshot_docs but this is not recommended.") - return self.test_docs() + return FewShotDataset(self.test_docs()) def _process_doc(self, doc): """ @@ -550,8 +551,13 @@ def __init__(self, model_name) -> None: # TODO no super() call here self._filters.append(filter_pipeline) else: self._filters = [build_filter_ensemble("none", [["take_first", None]])] + ########################################## + # TODO: for test, will delete later if self.config.task == "flickr30k_test": - pass # TODO: for test, will delete later + pass + else: + pass + ########################################### if self.config.fewshot_config is not None: try: random_seed = self.config.fewshot_config.get("random_seed", 1234) @@ -749,14 +755,14 @@ def test_docs(self) -> datasets.Dataset: return self.dataset[self.config.test_split] def fewshot_docs(self): + if "fewshot_dataset" in self.config.fewshot_config: fewshot_dataset_config = self.config.fewshot_config["fewshot_dataset"] - return datasets.load_dataset( - path=fewshot_dataset_config["dataset_path"], - name=fewshot_dataset_config.get("dataset_name", None), - split=fewshot_dataset_config["fewshot_split"], - download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, - **fewshot_dataset_config["dataset_kwargs"] if "dataset_kwargs" in fewshot_dataset_config else {}, + return FewShotDataset( + dataset_path=fewshot_dataset_config["dataset_path"], + dataset_name=fewshot_dataset_config.get("dataset_name", None), + split=fewshot_dataset_config["split"], + fewshot_dataset_config=fewshot_dataset_config.get("dataset_kwargs", {}), ) else: if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): From 89e89e033cad4271f6506eb525ef6383d09212e1 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Thu, 28 Mar 2024 02:05:37 +0800 Subject: [PATCH 04/38] Fix sampler constructor in ConfigurableTask --- lmms_eval/api/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 4d03f320..96139475 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -562,7 +562,7 @@ def __init__(self, model_name) -> None: # TODO no super() call here try: random_seed = self.config.fewshot_config.get("random_seed", 1234) sampler_constructor = samplers.get_sampler(self.config.fewshot_config.get("sampler", "default") if self.config.fewshot_config else "default") - self.sampler = sampler_constructor(list(self.fewshot_docs()), self, 
rnd=random.Random(random_seed)) + self.sampler = sampler_constructor(self.fewshot_docs(), self, rnd=random.Random(random_seed)) except Exception as e: eval_logger.error(f"Error in fewshot_config: {e}") @@ -762,7 +762,7 @@ def fewshot_docs(self): dataset_path=fewshot_dataset_config["dataset_path"], dataset_name=fewshot_dataset_config.get("dataset_name", None), split=fewshot_dataset_config["split"], - fewshot_dataset_config=fewshot_dataset_config.get("dataset_kwargs", {}), + dataset_kwargs=fewshot_dataset_config.get("dataset_kwargs", {}), ) else: if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): From 96129e02fe1d21099b867ebdfefc01ed7ca12760 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Thu, 28 Mar 2024 17:17:34 +0800 Subject: [PATCH 05/38] Add same_as_eval parameter to FewShotDataset constructor --- lmms_eval/api/samplers.py | 21 ++++++++++++------- lmms_eval/api/task.py | 6 +++++- .../textvqa/_default_template_textvqa_yaml | 7 +++++++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index e0688851..c9084e9b 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -2,7 +2,7 @@ class FewShotDataset(object): - def __init__(self, dataset=None, *, dataset_path: str = None, dataset_name: str = None, split: str = None, dataset_kwargs: dict = None): + def __init__(self, dataset=None, *, dataset_path: str = None, dataset_name: str = None, split: str = None, dataset_kwargs: dict = None, same_as_eval: bool = False): if dataset is not None and (dataset_path is not None or dataset_name is not None or split is not None or dataset_kwargs is not None): raise ValueError("Cannot provide both `dataset` and other dataset arguments!") self.dataset_path = dataset_path @@ -10,15 +10,22 @@ def __init__(self, dataset=None, *, dataset_path: str = None, dataset_name: str self.split = split self.dataset = dataset self.dataset_kwargs = dataset_kwargs if dataset_kwargs is not None else {} + self.same_as_eval = same_as_eval + self.fewshot_indices = None - def get_dataset(self): + def get_dataset(self) -> datasets.Dataset: if self.dataset is None: self.dataset = datasets.load_dataset(path=self.dataset_path, name=self.dataset_name, split=self.split, download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, **self.dataset_kwargs) + if self.fewshot_indices: + self.dataset = self.dataset.select(self.fewshot_indices) return self.dataset + def __getitem__(self, item): + return self.get_dataset()[item] + class ContextSampler: - def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None: + def __init__(self, docs: FewShotDataset, task, fewshot_indices=None, rnd=None) -> None: self.rnd = rnd assert self.rnd, "must pass rnd to FewShotSampler!" 
@@ -32,13 +39,13 @@ def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None: self.doc_to_target = self.task.doc_to_target self.doc_to_choice = self.task.doc_to_choice - self.docs = docs # HF dataset split, provided by task._fewshot_docs() + self.docs: FewShotDataset = docs # HF dataset split, provided by task._fewshot_docs() if fewshot_indices: # subset few-shot docs from - self.docs = self.docs.select(fewshot_indices) + self.docs.fewshot_indices = fewshot_indices def get_context(self, doc, num_fewshot): # draw an extra fewshot sample if using same split as evaluating on - n_samples = num_fewshot + 1 if self.config.fewshot_split == self.config.test_split else num_fewshot + n_samples = num_fewshot + 1 if self.docs.same_as_eval else num_fewshot # draw `n_samples` docs from fewshot_docs fewshotex = self.sample(n_samples) @@ -71,7 +78,7 @@ def sample(self, n): Draw `n` samples from our fewshot docs. This method should be overridden by subclasses. """ - return self.rnd.sample(self.docs, n) + return self.rnd.sample(self.docs.get_dataset(), n) class FirstNSampler(ContextSampler): diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 96139475..2fff7252 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -312,7 +312,7 @@ def fewshot_docs(self): else: if self.config.num_fewshot is not None: eval_logger.warning("has_training_docs and has_validation_docs are False" ", using test_docs as fewshot_docs but this is not recommended.") - return FewShotDataset(self.test_docs()) + return FewShotDataset(self.test_docs(), same_as_eval=True) def _process_doc(self, doc): """ @@ -758,11 +758,15 @@ def fewshot_docs(self): if "fewshot_dataset" in self.config.fewshot_config: fewshot_dataset_config = self.config.fewshot_config["fewshot_dataset"] + if "dataset_path" not in fewshot_dataset_config: + fewshot_dataset_config["dataset_path"] = self.config.dataset_path + same_as_eval = self.config.dataset_path == fewshot_dataset_config["dataset_path"] and self.config.dataset_name == fewshot_dataset_config.get("dataset_name", None) and self.config.test_split == fewshot_dataset_config["split"] return FewShotDataset( dataset_path=fewshot_dataset_config["dataset_path"], dataset_name=fewshot_dataset_config.get("dataset_name", None), split=fewshot_dataset_config["split"], dataset_kwargs=fewshot_dataset_config.get("dataset_kwargs", {}), + same_as_eval=same_as_eval, ) else: if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): diff --git a/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml b/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml index 282b401b..fd485002 100644 --- a/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml +++ b/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml @@ -15,3 +15,10 @@ model_specific_prompt_kwargs: qwen_vl: pre_prompt: "" post_prompt: " Answer:" +fewshot_config: + fewshot_sampler: default + fewshot_dataset: + # dataset_path: lmms-lab/flickr30k + # dataset_name: + split: train + # random_seed: 1234 From 93f56d96cff14ce24cf4f80a8c3c711662368244 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Fri, 29 Mar 2024 17:05:04 +0800 Subject: [PATCH 06/38] fix sampling --- lmms_eval/api/samplers.py | 6 +++++- lmms_eval/api/task.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index c9084e9b..9c7b3bf7 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -19,6 +19,10 @@ def get_dataset(self) -> datasets.Dataset: if self.fewshot_indices: 
self.dataset = self.dataset.select(self.fewshot_indices) return self.dataset + + def sample(self, n, rnd): + indices = rnd.sample(range(len(self.get_dataset())), n) + return self.get_dataset().select(indices) def __getitem__(self, item): return self.get_dataset()[item] @@ -78,7 +82,7 @@ def sample(self, n): Draw `n` samples from our fewshot docs. This method should be overridden by subclasses. """ - return self.rnd.sample(self.docs.get_dataset(), n) + return self.docs.sample(n) class FirstNSampler(ContextSampler): diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 2fff7252..abd36470 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -553,7 +553,7 @@ def __init__(self, model_name) -> None: # TODO no super() call here self._filters = [build_filter_ensemble("none", [["take_first", None]])] ########################################## # TODO: for test, will delete later - if self.config.task == "flickr30k_test": + if self.config.task == "textvqa_test": pass else: pass From dd76e53378b8d0d2920ea76f74af9e259039d1c7 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Fri, 29 Mar 2024 17:07:24 +0800 Subject: [PATCH 07/38] remove dulpilicated code --- lmms_eval/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py index 0c666145..9e11028e 100644 --- a/lmms_eval/__main__.py +++ b/lmms_eval/__main__.py @@ -211,7 +211,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: except Exception as e: traceback.print_exc() eval_logger.error(f"Error during evaluation: {e}") - traceback.print_exc() + # traceback.print_exc() results_list.append(None) for args, results in zip(args_list, results_list): From a4d86da662df32125244b11ce0454dd7b34ac2c3 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Fri, 29 Mar 2024 17:32:38 +0800 Subject: [PATCH 08/38] fix a small bug --- lmms_eval/api/samplers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index 9c7b3bf7..c2a175ba 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -82,7 +82,7 @@ def sample(self, n): Draw `n` samples from our fewshot docs. This method should be overridden by subclasses. 
""" - return self.docs.sample(n) + return self.docs.sample(n, self.rnd) class FirstNSampler(ContextSampler): From 70636cfd15a166c19254c2d74968f4c30129bbf6 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Fri, 29 Mar 2024 17:33:15 +0800 Subject: [PATCH 09/38] Merge commit '08d4151cea53725b3e016bf546b58bece8d51c38' --- README.md | 4 +- llava_repr_requirements.txt | 19 +- lmms_eval/tasks/olympiadbench/cn_utils.py | 69 ++++ lmms_eval/tasks/olympiadbench/en_utils.py | 69 ++++ .../tasks/olympiadbench/olympiadbench.yaml | 6 + .../olympiadbench/olympiadbench_evals.py | 355 ++++++++++++++++++ .../olympiadbench/olympiadbench_test_cn.yaml | 25 ++ .../olympiadbench/olympiadbench_test_en.yaml | 25 ++ pyproject.toml | 2 +- 9 files changed, 570 insertions(+), 4 deletions(-) create mode 100644 lmms_eval/tasks/olympiadbench/cn_utils.py create mode 100644 lmms_eval/tasks/olympiadbench/en_utils.py create mode 100644 lmms_eval/tasks/olympiadbench/olympiadbench.yaml create mode 100644 lmms_eval/tasks/olympiadbench/olympiadbench_evals.py create mode 100644 lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml create mode 100644 lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml diff --git a/README.md b/README.md index 7b845f44..a2f8e720 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,9 @@ 🏠 [Homepage](https://lmms-lab.github.io/) | 🎉 [Blog](https://lmms-lab.github.io/lmms-eval-blog/lmms-eval-0.1/) | 📚 [Documentation](docs/README.md) | 🤗 [Huggingface Datasets](https://huggingface.co/lmms-lab) | Discord_Thread [discord/lmms-eval](https://discord.gg/ebAMGSsS) -In today's world, we're on a thrilling quest for Artificial General Intelligence (AGI), driven by a passion that reminds us of the excitement surrounding the 1960s moon landing. At the heart of this adventure are the incredible large language models (LLMs) and large multimodal models (LMMs). These models are like brilliant minds that can understand, learn, and interact with a vast array of human tasks, marking a significant leap toward our goal. +In today's world, we're on an exciting journey toward creating Artificial General Intelligence (AGI), much like the enthusiasm of the 1960s moon landing. This journey is powered by advanced large language models (LLMs) and large multimodal models (LMMs), which are complex systems capable of understanding, learning, and performing a wide variety of human tasks. These advancements bring us closer to achieving AGI. -To truly understand how capable these models are, we've started to create and use a wide variety of evaluation benchmarks. These benchmarks help us map out a detailed chart of abilities, showing us how close we are to achieving true AGI. However, this journey is not without its challenges. The sheer number of benchmarks and datasets we need to look at is overwhelming. They're all over the place - tucked away in someone's Google Drive, scattered across Dropbox, and hidden in the corners of various school and research lab websites. It's like embarking on a treasure hunt where the maps are spread far and wide. +To gauge how advanced these models are, we use a variety of evaluation benchmarks. These benchmarks are tools that help us understand the capabilities of these models, showing us how close we are to achieving AGI. However, finding and using these benchmarks is a big challenge. The necessary benchmarks and datasets are spread out and hidden in various places like Google Drive, Dropbox, and different school and research lab websites. 
It feels like we're on a treasure hunt, but the maps are scattered everywhere. In the field of language models, there has been a valuable precedent set by the work of [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). They offer integrated data and model interfaces, enabling rapid evaluation of language models and serving as the backend support framework for the [open-llm-leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), and has gradually become the underlying ecosystem of the era of foundation models. diff --git a/llava_repr_requirements.txt b/llava_repr_requirements.txt index 07afb686..e3f0f527 100644 --- a/llava_repr_requirements.txt +++ b/llava_repr_requirements.txt @@ -27,6 +27,23 @@ shortuuid==1.0.12 sqlitedict==2.1.0 tenacity==8.2.3 torch==2.0.1 +openai>=1.0.0 +pycocoevalcap tokenizers==0.15.2 tqdm==4.66.2 -transformers==4.37.2 \ No newline at end of file +tqdm-multiprocess +transformers==4.37.2 +zstandard +pillow +pyyaml +sympy +mpmath +Jinja2 +openpyxl +Levenshtein +hf_transfer +tenacity +wandb>=0.16.0 +transformers-stream-generator +tiktoken +pre-commit \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/cn_utils.py b/lmms_eval/tasks/olympiadbench/cn_utils.py new file mode 100644 index 00000000..34e5ce4d --- /dev/null +++ b/lmms_eval/tasks/olympiadbench/cn_utils.py @@ -0,0 +1,69 @@ +import os +import json +import datetime +from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + +import logging +eval_logger = logging.getLogger("lmms-eval") +dir_name = os.path.dirname(os.path.abspath(__file__)) + +olympiadbench_evaluator = OlympiadBenchEvaluator() + +def olympiadbench_doc_to_visual(doc): + return [image.convert("RGB") for image in doc["images"]] + +def olympiadbench_doc_to_text(doc): + question = doc["question"] + subject = doc["subfield"] + mul_ans = doc["is_multiple_answer"] + if mul_ans is None: + mul_ans = False + ans_type = doc["answer_type"] + if ans_type == "Need_human_evaluate": + ans_type = "proof based" + + pre_prompt = f"以下是中国{subject}竞赛中的解答题。\n" + + post_prompt = "" + if not mul_ans: + post_prompt += f"答案类型为{ans_type}。\n" + else: + post_prompt += f"题目有多个答案,答案类型均为{ans_type}。\n" + post_prompt += "请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以" + if not mul_ans: + post_prompt += '"所以最终答案是\\boxed{答案}。"\n' + else: + post_prompt += '"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n' + + final_question = pre_prompt + question + '\n' + post_prompt + return final_question + +def olympiadbench_process_results(doc, results): + precision = doc["error"] + is_proving = "TP" in doc["source"] + if precision is None: + precision = 0 + prediction = results[0].strip() + + if is_proving: + return { + "submission": prediction + } + else: + prediction = prediction.split("所以最终答案是")[-1] + prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。") + accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision) + accuracy = int(accuracy) + return { + "exact_match": accuracy + } + +def olympiadbench_aggregate_results(results, args): + now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") + submission_file_name = f"olympiadbench-test-cn-submission-{now_date_time}.json" + path = generate_submission_file(submission_file_name, args) + with open(path, "w") as f: + json.dump(results, f, ensure_ascii=False) + print(f"Submission file saved to {path}") 
+ \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/en_utils.py b/lmms_eval/tasks/olympiadbench/en_utils.py new file mode 100644 index 00000000..a21ee159 --- /dev/null +++ b/lmms_eval/tasks/olympiadbench/en_utils.py @@ -0,0 +1,69 @@ +import os +import json +import datetime +from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + +import logging +eval_logger = logging.getLogger("lmms-eval") +dir_name = os.path.dirname(os.path.abspath(__file__)) + +olympiadbench_evaluator = OlympiadBenchEvaluator() + +def olympiadbench_doc_to_visual(doc): + return [image.convert("RGB") for image in doc["images"]] + +def olympiadbench_doc_to_text(doc): + question = doc["question"] + subject = doc["subfield"] + mul_ans = doc["is_multiple_answer"] + if mul_ans is None: + mul_ans = False + ans_type = doc["answer_type"] + if ans_type == "Need_human_evaluate": + ans_type = "proof based" + + pre_prompt = f"The following is a question from an International {subject} competition.\n" + + post_prompt = "" + if not mul_ans: + post_prompt += f"The answer of the question should be {ans_type}.\n" + else: + post_prompt += f"The question has multiple answers, each of them should be {ans_type}.\n" + post_prompt += "Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with " + if not mul_ans: + post_prompt += '"So the final answer is \\boxed{answer}."\n' + else: + post_prompt += 'So the final answer is \\boxed{multiple answers connected with commas}.\n' + + final_question = pre_prompt + question + '\n' + post_prompt + return final_question + +def olympiadbench_process_results(doc, results): + precision = doc["error"] + is_proving = "TP" in doc["source"] + if precision is None: + precision = 0 + prediction = results[0].strip() + + if is_proving: + return { + "submission": prediction + } + else: + prediction = prediction.split("final answer is")[-1] + prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。") + accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision) + accuracy = int(accuracy) + return { + "exact_match": accuracy + } + +def olympiadbench_aggregate_results(results, args): + now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") + submission_file_name = f"olympiadbench-test-en-submission-{now_date_time}.json" + path = generate_submission_file(submission_file_name, args) + with open(path, "w") as f: + json.dump(results, f, ensure_ascii=False) + print(f"Submission file saved to {path}") + \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench.yaml b/lmms_eval/tasks/olympiadbench/olympiadbench.yaml new file mode 100644 index 00000000..1580b158 --- /dev/null +++ b/lmms_eval/tasks/olympiadbench/olympiadbench.yaml @@ -0,0 +1,6 @@ +group: olympiadbench +task: +- olympiadbench_test_en +- olympiadbench_test_cn +metadata: + - version: 0.0 diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py b/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py new file mode 100644 index 00000000..dd40f611 --- /dev/null +++ b/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py @@ -0,0 +1,355 @@ +import re +import sympy as sp +from sympy import simplify, Eq, sympify, Pow +from sympy.parsing.latex import parse_latex 
+import math + +# how to use +# scorer = OlympiadBenchEvaluator() +# exp1 = "10^{10^{10^{10}}}" +# exp2 = "10^{10}" +# precision = 1e-4 +# res = scorer.judge(exp1, exp2, precision) + +class OlympiadBenchEvaluator: + def __init__(self): + # Map of special symbols to their replacements + self.special_signal_map = { + "\\left": "", + "\\right": "", + "∶": ":", + ",": ",", + "$": "", + "\\approx": "=", + "\\simeq": "=", + "\\sim": "=", + "^\\prime": "'", + "^{\\prime}": "'", + "^\\circ": "", + "%": "", + } + self.pi = parse_latex("\\pi") + self.precision = 1e-8 # Default precision for comparison + + def split_by_comma(self, expr: str): + # Splits expressions by commas outside of brackets + in_bracket_num = 0 + splitted_expr = [] + start_idx = 0 + for i, char in enumerate(expr): + if char in ["(", "["]: + in_bracket_num += 1 + elif char in [")", "]"]: + in_bracket_num -= 1 + elif char == "," and in_bracket_num == 0: + splitted_expr.append(expr[start_idx:i].strip()) + start_idx = i + 1 + + if start_idx < len(expr): + splitted_expr.append(expr[start_idx:].strip()) + + return splitted_expr + + def trans_plus_minus_sign(self, expr_list: list): + # Translates plus-minus signs into separate expressions + new_expr_list = [] + for expr in expr_list: + if "\\pm" in expr: + new_expr_list.append(expr.replace("\\pm", "+")) + new_expr_list.append(expr.replace("\\pm", "-")) + else: + new_expr_list.append(expr) + + return new_expr_list + + def judge(self, expression1, expression2, precision=1e-8): + # Judge if two expressions are equal (expression1 is considered as the Ground Truth) + # Default precision is a list for supporting multiple expressions + precision = precision if isinstance(precision, list) else [precision] + + try: + expression1, expression2 = self.preprocess(expression1, expression2) + except: + return False + if expression1 == expression2: + # print("Exactly equal") + return True + + # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered + expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1) + expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2) + + expression1 = self.split_by_comma(expression1) + expression2 = self.split_by_comma(expression2) + + temp_list1 = self.trans_plus_minus_sign(expression1) + temp_list2 = self.trans_plus_minus_sign(expression2) + + # Set up a list for allowed errors + if len(precision) <= 1: + precision = precision * len(temp_list1) + + if len(temp_list1) != len(temp_list2): + return False + + # Check if elements in both lists can be paired and are equal + idx = -1 + while len(temp_list1) != 0: + idx = (idx + 1) % len(temp_list1) + + item1 = temp_list1[idx] + self.precision = precision[idx] + + for item2 in temp_list2: + if self.is_equal(item1, item2): + temp_list1.remove(item1) + temp_list2.remove(item2) + precision.remove(self.precision) + break + else: + # If no match was found, return False + return False + + # If all elements are matched, return True + return True + + def is_interval(self, expr): + # Checks if an expression is an interval + return expr.startswith(("(", "[")) and expr.endswith((")", "]")) + + def sympy_sub_pi(self, expression_sympy): + # Replaces the symbol for pi in sympy expressions with its numerical value + return expression_sympy.subs(self.pi, math.pi) + + def is_equal(self, expression1, expression2): + # Default first expression is ground truth. 
Check if expressions are equal in different aspects + if expression1 == expression2 and expression1 != "" and expression2 != "": + # print("Equivalent natively") + return True + + # First check if both are intervals + if self.is_interval(expression1) and self.is_interval(expression2): + try: + if self.interval_equal(expression1, expression2): + # print("Interval equivalent") + return True + except: + return False + + # Then check for numerical equality + try: + if self.numerical_equal(expression1, expression2): + # print("Numerically equivalent") + return True + except: + pass + + # Then check if expressions are mathematically equal + try: + if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2): + # print("Expression equivalent") + return True + except: + pass + + # Lastly, check for equation equality + try: + if self.equation_equal(expression1, expression2): + # print("Equation equivalent") + return True + except: + pass + + return False + + def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True): + # Check if two numerical values are equal within an allowed error range + # Includes possible percentage cases + reference = float(expression1) + prediction = float(expression2) + + if include_percentage: + gt_result = [reference / 100, reference, reference * 100] + else: + gt_result = [reference] + + for item in gt_result: + if abs(item - prediction) <= self.precision * 1.01: + return True + return False + + + def expression_equal(self, exp1, exp2): + # Check if two expressions are mathematically equivalent + # Extract expression and use sympy for equivalence checking + def extract_expression(expression): + if "=" in expression: + expression = expression.split("=")[1] + return expression.strip() + + exp1 = extract_expression(exp1) + exp2 = extract_expression(exp2) + + expr1_sym = sympify(parse_latex(exp1)) + expr2_sym = sympify(parse_latex(exp2)) + + if expr1_sym == expr2_sym: + return True + else: + expr1_sym = self.sympy_sub_pi(expr1_sym) + expr2_sym = self.sympy_sub_pi(expr2_sym) + + if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)): + return False + elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol): + try: + if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)): + print(f"These two numbers cannot be calculated by the current computer for: \"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"") + return False + + if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01: + return True + else: + return False + except: + return False + else: + try: + simplified_expr = simplify(expr1_sym - expr2_sym) + + num_value = simplified_expr.evalf() + + return abs(num_value) < 1e-3 + except: + return False + + def equation_equal(self, expression1, expression2): + # Check if two equations are mathematically equivalent + # Simplify equations and use sympy for equivalence checking + def simplify_equation(latex_eq): + lhs, rhs = latex_eq.split('=') + + lhs_expr = parse_latex(lhs) + rhs_expr = parse_latex(rhs) + + equation = Eq(lhs_expr, rhs_expr) + + simplified_eq = simplify(equation.lhs - equation.rhs) + + return simplified_eq + + expr1_sym = simplify_equation(expression1) + expr2_sym = simplify_equation(expression2) + + division_result_1 = simplify(expr1_sym / expr2_sym) + division_result_2 = simplify(expr2_sym / expr1_sym) + + if (division_result_1.is_Integer and division_result_1 != 0) or 
(division_result_2.is_Integer and division_result_2 != 0): + return True + else: + return False + + def interval_equal(self, expression1, expression2): + # Check if two intervals are mathematically equivalent + def compare_two_interval(inter1, inter2): + if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]: + return False + + inter1 = inter1.strip('[]()') + inter2 = inter2.strip('[]()') + + items_1 = inter1.split(',') + items_2 = inter2.split(',') + + for item_1, item_2 in zip(items_1, items_2): + if not self.expression_equal(item_1, item_2): + return False + return True + + interval1 = expression1 + interval2 = expression2 + + if interval1 == interval2: + return True + else: + inter_list1 = interval1.split("\\cup") + inter_list2 = interval2.split("\\cup") + + if len(inter_list1) != len(inter_list2): + return False + else: + for inter1, inter2 in zip(inter_list1, inter_list2): + if not compare_two_interval(inter1, inter2): + return False + return True + + def preprocess(self, expression1, expression2): + # Preprocess expressions to extract and replace special symbols + def extract_boxed_content(latex_str): + boxed_matches = re.finditer(r'\\boxed{', latex_str) + results = "" + + for match in boxed_matches: + start_index = match.end() + end_index = start_index + stack = 1 + + while stack > 0 and end_index < len(latex_str): + if latex_str[end_index] == '{': + stack += 1 + elif latex_str[end_index] == '}': + stack -= 1 + end_index += 1 + + if stack == 0: + content = latex_str[start_index:end_index - 1] + results += content + "," + else: + raise ValueError("Mismatched braces in LaTeX string.") + + if results == "": + last_line_ans = latex_str.strip().split("\n")[-1] + dollar_pattern = r"\$(.*?)\$" + answers = re.findall(dollar_pattern, last_line_ans) + + if answers: + for ans in answers: + results += ans + "," + else: + results = latex_str + + return results + + def sepcial_symbol_replace(expression): + if "\\in " in expression: + expression = expression.split("\\in ")[1] + + for signal in self.special_signal_map: + expression = expression.replace(signal, self.special_signal_map[signal]) + + expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。") + + pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}' + expression = re.sub(pattern, r'\1', expression) + + return expression + + exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2) + exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2) + + return exp1, exp2 + + def can_compute_power(self, expr): + # Checks if a power expression can be computed + if isinstance(expr, Pow): + base, exp = expr.as_base_exp() + if base.is_number and exp.is_number: + MAX_EXP = 1000 # Adjust based on computing environment + if abs(exp.evalf()) > MAX_EXP: + return False + else: + return True + else: + return False + else: + return True # Not a power expression, can compute \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml b/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml new file mode 100644 index 00000000..574d0c19 --- /dev/null +++ b/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml @@ -0,0 +1,25 @@ +dataset_path: lmms-lab/OlympiadBench +dataset_kwargs: + token: True +task : "olympiadbench_test_cn" +test_split: test_cn +output_type: generate_until +doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual +doc_to_text: !function cn_utils.olympiadbench_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" + max_new_tokens: 1024 + 
temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function cn_utils.olympiadbench_process_results +metric_list: + - metric: submission + aggregation: !function cn_utils.olympiadbench_aggregate_results + higher_is_better: true + - metric: exact_match + aggregation: mean + higher_is_better: true \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml b/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml new file mode 100644 index 00000000..6d293fb7 --- /dev/null +++ b/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml @@ -0,0 +1,25 @@ +dataset_path: lmms-lab/OlympiadBench +dataset_kwargs: + token: True +task : "olympiadbench_test_en" +test_split: test_en +output_type: generate_until +doc_to_visual: !function en_utils.olympiadbench_doc_to_visual +doc_to_text: !function en_utils.olympiadbench_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function en_utils.olympiadbench_process_results +metric_list: + - metric: submission + aggregation: !function en_utils.olympiadbench_aggregate_results + higher_is_better: true + - metric: exact_match + aggregation: mean + higher_is_better: true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ba37a96c..c50c4e76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "openai>=1.0.0", "pycocoevalcap", "tqdm-multiprocess", - "transformers==4.37.2", + "transformers", "zstandard", "pillow", "pyyaml", From c9f759e3dcc588a5e32213378fd5ead9a411cfdb Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Fri, 29 Mar 2024 17:34:34 +0800 Subject: [PATCH 10/38] lint --- lmms_eval/api/samplers.py | 2 +- lmms_eval/tasks/olympiadbench/cn_utils.py | 18 ++--- lmms_eval/tasks/olympiadbench/en_utils.py | 24 +++--- .../olympiadbench/olympiadbench_evals.py | 80 +++++++++---------- 4 files changed, 63 insertions(+), 61 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index c2a175ba..a232d0c0 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -19,7 +19,7 @@ def get_dataset(self) -> datasets.Dataset: if self.fewshot_indices: self.dataset = self.dataset.select(self.fewshot_indices) return self.dataset - + def sample(self, n, rnd): indices = rnd.sample(range(len(self.get_dataset())), n) return self.get_dataset().select(indices) diff --git a/lmms_eval/tasks/olympiadbench/cn_utils.py b/lmms_eval/tasks/olympiadbench/cn_utils.py index 34e5ce4d..628d51da 100644 --- a/lmms_eval/tasks/olympiadbench/cn_utils.py +++ b/lmms_eval/tasks/olympiadbench/cn_utils.py @@ -5,14 +5,17 @@ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file import logging + eval_logger = logging.getLogger("lmms-eval") dir_name = os.path.dirname(os.path.abspath(__file__)) olympiadbench_evaluator = OlympiadBenchEvaluator() + def olympiadbench_doc_to_visual(doc): return [image.convert("RGB") for image in doc["images"]] + def olympiadbench_doc_to_text(doc): question = doc["question"] subject = doc["subfield"] @@ -36,28 +39,26 @@ def olympiadbench_doc_to_text(doc): else: post_prompt += '"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n' - final_question = pre_prompt + question + '\n' + post_prompt + final_question = pre_prompt + question + "\n" + post_prompt return final_question + def olympiadbench_process_results(doc, results): precision = doc["error"] - is_proving = "TP" in doc["source"] + 
is_proving = "TP" in doc["source"] if precision is None: precision = 0 prediction = results[0].strip() if is_proving: - return { - "submission": prediction - } + return {"submission": prediction} else: prediction = prediction.split("所以最终答案是")[-1] prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。") accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision) accuracy = int(accuracy) - return { - "exact_match": accuracy - } + return {"exact_match": accuracy} + def olympiadbench_aggregate_results(results, args): now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") @@ -66,4 +67,3 @@ def olympiadbench_aggregate_results(results, args): with open(path, "w") as f: json.dump(results, f, ensure_ascii=False) print(f"Submission file saved to {path}") - \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/en_utils.py b/lmms_eval/tasks/olympiadbench/en_utils.py index a21ee159..4b165e38 100644 --- a/lmms_eval/tasks/olympiadbench/en_utils.py +++ b/lmms_eval/tasks/olympiadbench/en_utils.py @@ -5,14 +5,17 @@ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file import logging + eval_logger = logging.getLogger("lmms-eval") dir_name = os.path.dirname(os.path.abspath(__file__)) olympiadbench_evaluator = OlympiadBenchEvaluator() + def olympiadbench_doc_to_visual(doc): return [image.convert("RGB") for image in doc["images"]] + def olympiadbench_doc_to_text(doc): question = doc["question"] subject = doc["subfield"] @@ -30,34 +33,34 @@ def olympiadbench_doc_to_text(doc): post_prompt += f"The answer of the question should be {ans_type}.\n" else: post_prompt += f"The question has multiple answers, each of them should be {ans_type}.\n" - post_prompt += "Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with " + post_prompt += ( + "Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. 
Please end your solution with " + ) if not mul_ans: post_prompt += '"So the final answer is \\boxed{answer}."\n' else: - post_prompt += 'So the final answer is \\boxed{multiple answers connected with commas}.\n' + post_prompt += "So the final answer is \\boxed{multiple answers connected with commas}.\n" - final_question = pre_prompt + question + '\n' + post_prompt + final_question = pre_prompt + question + "\n" + post_prompt return final_question + def olympiadbench_process_results(doc, results): precision = doc["error"] - is_proving = "TP" in doc["source"] + is_proving = "TP" in doc["source"] if precision is None: precision = 0 prediction = results[0].strip() if is_proving: - return { - "submission": prediction - } + return {"submission": prediction} else: prediction = prediction.split("final answer is")[-1] prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。") accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision) accuracy = int(accuracy) - return { - "exact_match": accuracy - } + return {"exact_match": accuracy} + def olympiadbench_aggregate_results(results, args): now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") @@ -66,4 +69,3 @@ def olympiadbench_aggregate_results(results, args): with open(path, "w") as f: json.dump(results, f, ensure_ascii=False) print(f"Submission file saved to {path}") - \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py b/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py index dd40f611..5ae36883 100644 --- a/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py +++ b/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py @@ -11,6 +11,7 @@ # precision = 1e-4 # res = scorer.judge(exp1, exp2, precision) + class OlympiadBenchEvaluator: def __init__(self): # Map of special symbols to their replacements @@ -46,8 +47,8 @@ def split_by_comma(self, expr: str): start_idx = i + 1 if start_idx < len(expr): - splitted_expr.append(expr[start_idx:].strip()) - + splitted_expr.append(expr[start_idx:].strip()) + return splitted_expr def trans_plus_minus_sign(self, expr_list: list): @@ -59,9 +60,9 @@ def trans_plus_minus_sign(self, expr_list: list): new_expr_list.append(expr.replace("\\pm", "-")) else: new_expr_list.append(expr) - + return new_expr_list - + def judge(self, expression1, expression2, precision=1e-8): # Judge if two expressions are equal (expression1 is considered as the Ground Truth) # Default precision is a list for supporting multiple expressions @@ -74,11 +75,11 @@ def judge(self, expression1, expression2, precision=1e-8): if expression1 == expression2: # print("Exactly equal") return True - + # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered - expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1) - expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2) - + expression1 = re.sub(r"[\u4e00-\u9fff]+", "", expression1) + expression2 = re.sub(r"[\u4e00-\u9fff]+", "", expression2) + expression1 = self.split_by_comma(expression1) expression2 = self.split_by_comma(expression2) @@ -88,7 +89,7 @@ def judge(self, expression1, expression2, precision=1e-8): # Set up a list for allowed errors if len(precision) <= 1: precision = precision * len(temp_list1) - + if len(temp_list1) != len(temp_list2): return False @@ -112,7 +113,7 @@ def judge(self, expression1, expression2, precision=1e-8): # If all elements are matched, return True return True - + def is_interval(self, expr): # 
Checks if an expression is an interval return expr.startswith(("(", "[")) and expr.endswith((")", "]")) @@ -120,7 +121,7 @@ def is_interval(self, expr): def sympy_sub_pi(self, expression_sympy): # Replaces the symbol for pi in sympy expressions with its numerical value return expression_sympy.subs(self.pi, math.pi) - + def is_equal(self, expression1, expression2): # Default first expression is ground truth. Check if expressions are equal in different aspects if expression1 == expression2 and expression1 != "" and expression2 != "": @@ -143,7 +144,7 @@ def is_equal(self, expression1, expression2): return True except: pass - + # Then check if expressions are mathematically equal try: if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2): @@ -151,7 +152,7 @@ def is_equal(self, expression1, expression2): return True except: pass - + # Lastly, check for equation equality try: if self.equation_equal(expression1, expression2): @@ -159,7 +160,7 @@ def is_equal(self, expression1, expression2): return True except: pass - + return False def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True): @@ -167,17 +168,16 @@ def numerical_equal(self, expression1: str, expression2: str, include_percentage # Includes possible percentage cases reference = float(expression1) prediction = float(expression2) - + if include_percentage: gt_result = [reference / 100, reference, reference * 100] else: gt_result = [reference] - + for item in gt_result: if abs(item - prediction) <= self.precision * 1.01: return True return False - def expression_equal(self, exp1, exp2): # Check if two expressions are mathematically equivalent @@ -186,7 +186,7 @@ def extract_expression(expression): if "=" in expression: expression = expression.split("=")[1] return expression.strip() - + exp1 = extract_expression(exp1) exp2 = extract_expression(exp2) @@ -204,7 +204,7 @@ def extract_expression(expression): elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol): try: if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)): - print(f"These two numbers cannot be calculated by the current computer for: \"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"") + print(f'These two numbers cannot be calculated by the current computer for: "{str(expr1_sym)}" and "{str(expr2_sym)}"') return False if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01: @@ -218,7 +218,7 @@ def extract_expression(expression): simplified_expr = simplify(expr1_sym - expr2_sym) num_value = simplified_expr.evalf() - + return abs(num_value) < 1e-3 except: return False @@ -227,7 +227,7 @@ def equation_equal(self, expression1, expression2): # Check if two equations are mathematically equivalent # Simplify equations and use sympy for equivalence checking def simplify_equation(latex_eq): - lhs, rhs = latex_eq.split('=') + lhs, rhs = latex_eq.split("=") lhs_expr = parse_latex(lhs) rhs_expr = parse_latex(rhs) @@ -254,18 +254,18 @@ def interval_equal(self, expression1, expression2): def compare_two_interval(inter1, inter2): if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]: return False - - inter1 = inter1.strip('[]()') - inter2 = inter2.strip('[]()') - items_1 = inter1.split(',') - items_2 = inter2.split(',') + inter1 = inter1.strip("[]()") + inter2 = inter2.strip("[]()") + + items_1 = inter1.split(",") + items_2 = inter2.split(",") for item_1, item_2 in zip(items_1, items_2): if not self.expression_equal(item_1, item_2): return False return True - 
+ interval1 = expression1 interval2 = expression2 @@ -274,7 +274,7 @@ def compare_two_interval(inter1, inter2): else: inter_list1 = interval1.split("\\cup") inter_list2 = interval2.split("\\cup") - + if len(inter_list1) != len(inter_list2): return False else: @@ -286,7 +286,7 @@ def compare_two_interval(inter1, inter2): def preprocess(self, expression1, expression2): # Preprocess expressions to extract and replace special symbols def extract_boxed_content(latex_str): - boxed_matches = re.finditer(r'\\boxed{', latex_str) + boxed_matches = re.finditer(r"\\boxed{", latex_str) results = "" for match in boxed_matches: @@ -295,14 +295,14 @@ def extract_boxed_content(latex_str): stack = 1 while stack > 0 and end_index < len(latex_str): - if latex_str[end_index] == '{': + if latex_str[end_index] == "{": stack += 1 - elif latex_str[end_index] == '}': + elif latex_str[end_index] == "}": stack -= 1 end_index += 1 if stack == 0: - content = latex_str[start_index:end_index - 1] + content = latex_str[start_index : end_index - 1] results += content + "," else: raise ValueError("Mismatched braces in LaTeX string.") @@ -317,28 +317,28 @@ def extract_boxed_content(latex_str): results += ans + "," else: results = latex_str - + return results - + def sepcial_symbol_replace(expression): if "\\in " in expression: expression = expression.split("\\in ")[1] - + for signal in self.special_signal_map: expression = expression.replace(signal, self.special_signal_map[signal]) expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。") - pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}' - expression = re.sub(pattern, r'\1', expression) + pattern = r"\\(?:mathrm|mathbf)\{~?([^}]*)\}" + expression = re.sub(pattern, r"\1", expression) return expression - + exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2) exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2) return exp1, exp2 - + def can_compute_power(self, expr): # Checks if a power expression can be computed if isinstance(expr, Pow): @@ -352,4 +352,4 @@ def can_compute_power(self, expr): else: return False else: - return True # Not a power expression, can compute \ No newline at end of file + return True # Not a power expression, can compute From 9979b0397ef6370576c405e3ae7664cc0055b58f Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 30 Mar 2024 23:27:45 +0800 Subject: [PATCH 11/38] update context sampler --- lmms_eval/api/instance.py | 2 +- lmms_eval/api/samplers.py | 91 ++++++++++++++++++++++++++++++--------- lmms_eval/models/llava.py | 8 ++++ pyproject.toml | 1 + 4 files changed, 80 insertions(+), 22 deletions(-) diff --git a/lmms_eval/api/instance.py b/lmms_eval/api/instance.py index 41875358..67f00fde 100644 --- a/lmms_eval/api/instance.py +++ b/lmms_eval/api/instance.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import Literal, Tuple +from typing import Literal, Tuple, Iterable, Callable @dataclass diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index a232d0c0..9278f9f1 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -1,4 +1,65 @@ import datasets +from typing import Callable + + +class LazyLoadedImages(object): + def __init__(self, data_frame, index): + self.data_frame: datasets.Dataset = data_frame + self.index = index + + def get_images(self, doc_to_visual): + return doc_to_visual(self.data_frame[self.index]) + + +class Context(object): + def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n"): + self.task = task 
+ self.config = task._config + + self.doc_to_visual = self.task.doc_to_visual + self.doc_to_text = self.task.doc_to_text + self.doc_to_target = self.task.doc_to_target + self.doc_to_choice = self.task.doc_to_choice + + self.target_delimiter = target_delimiter + self.few_shot_delimiter = few_shot_delimiter + + self.contexts = [] + + def get_question(self, doc, model_specific_prompt_kwargs=None): + return self.doc_to_text(doc, model_specific_prompt_kwargs) if (self.doc_to_choice is None or type(self.doc_to_text(doc)) is str) else self.doc_to_choice(doc)[self.doc_to_text(doc)] + + def get_target(self, doc): + return ( + str(self.doc_to_target(doc)[0]) + if type(self.doc_to_target(doc)) is list + else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) + ) + + def add_in_context_example(self, doc, model_specific_prompt_kwargs=None, data_frame=None, index=None): + question = self.get_question(doc, model_specific_prompt_kwargs) + if data_frame and index: + visual = LazyLoadedImages(data_frame, index) + else: + visual = None + target = self.doc_to_target(doc) + if visual: + self.contexts.append(visual) + self.contexts.append(question) + self.contexts.append(self.target_delimiter) + self.contexts.append(target) + self.contexts.append(self.few_shot_delimiter) + + def add_question(self, doc, model_specific_prompt_kwargs=None, data_frame=None, index=None): + question = self.doc_to_text(doc, model_specific_prompt_kwargs) + if data_frame and index: + visual = LazyLoadedImages(data_frame, index) + else: + visual = None + if visual: + self.contexts.append(visual) + self.contexts.append(question) + self.contexts.append(self.target_delimiter) class FewShotDataset(object): @@ -22,7 +83,7 @@ def get_dataset(self) -> datasets.Dataset: def sample(self, n, rnd): indices = rnd.sample(range(len(self.get_dataset())), n) - return self.get_dataset().select(indices) + return indices, self.get_dataset().select(indices) def __getitem__(self, item): return self.get_dataset()[item] @@ -47,33 +108,21 @@ def __init__(self, docs: FewShotDataset, task, fewshot_indices=None, rnd=None) - if fewshot_indices: # subset few-shot docs from self.docs.fewshot_indices = fewshot_indices - def get_context(self, doc, num_fewshot): + def get_context(self, doc, num_fewshot, model_specific_prompt_kwargs=None): # draw an extra fewshot sample if using same split as evaluating on n_samples = num_fewshot + 1 if self.docs.same_as_eval else num_fewshot # draw `n_samples` docs from fewshot_docs - fewshotex = self.sample(n_samples) + indices, fewshotex = self.sample(n_samples) # get rid of the doc that's the one we're evaluating, if it's in the fewshot # TODO: should we just stop people from using fewshot from same split as evaluating? - selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] - - labeled_examples = ( - self.fewshot_delimiter.join( - [ - # TODO: is separating doc_to_text and doc_to_target by one space always desired? 
- (self.doc_to_text(doc) if (self.config.doc_to_choice is None or type(self.doc_to_text(doc)) is str) else self.doc_to_choice(doc)[self.doc_to_text(doc)]) - + self.target_delimiter - + ( - str(self.doc_to_target(doc)[0]) - if type(self.doc_to_target(doc)) is list - else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) - ) - for doc in selected_docs - ] - ) - + self.fewshot_delimiter - ) + selected_docs = [(idx, x) for idx, x in zip(indices, fewshotex) if x != doc][:num_fewshot] + + labeled_examples = Context(self.task, self.fewshot_delimiter, self.target_delimiter) + + for idx, doc in selected_docs: + labeled_examples.add_in_context_example(doc, model_specific_prompt_kwargs, self.docs, idx) return labeled_examples diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index b3cb8a66..b93be935 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -272,6 +272,14 @@ def _collate(x): split = split[0] visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] visuals = self.flatten(visuals) + ############### for debugging ################### + # TODO: remove this block + if len(visuals) > 1: + for i in range(len(visuals)): + path = f"./logs/llava/{i}.png" + visuals[i].save(path) + pass + ################################################# # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. gen_kwargs = all_gen_kwargs[0] diff --git a/pyproject.toml b/pyproject.toml index c50c4e76..761575da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ dependencies = [ "tiktoken", "pre-commit", "pydantic", + "antlr4-python3-runtime==4.11", ] [tool.setuptools.packages.find] From 8d5685636ff84e29f0c1d3d6273f404b7206d0ec Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 00:15:47 +0800 Subject: [PATCH 12/38] Refactor get_question method in Context class --- lmms_eval/api/samplers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index 9278f9f1..f1c4fcf0 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -27,7 +27,8 @@ def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str self.contexts = [] def get_question(self, doc, model_specific_prompt_kwargs=None): - return self.doc_to_text(doc, model_specific_prompt_kwargs) if (self.doc_to_choice is None or type(self.doc_to_text(doc)) is str) else self.doc_to_choice(doc)[self.doc_to_text(doc)] + text = self.doc_to_text(doc, model_specific_prompt_kwargs) + return text if (self.doc_to_choice is None or isinstance(text, str)) else self.doc_to_choice(doc)[text] def get_target(self, doc): return ( From b677bf3d92cc8976045d301991be96b52a1b8bfe Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 00:21:52 +0800 Subject: [PATCH 13/38] fix --- lmms_eval/api/samplers.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index f1c4fcf0..41418380 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -26,8 +26,8 @@ def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str self.contexts = [] - def get_question(self, doc, model_specific_prompt_kwargs=None): - text = self.doc_to_text(doc, model_specific_prompt_kwargs) + def get_question(self, doc): + text = self.doc_to_text(doc) 
return text if (self.doc_to_choice is None or isinstance(text, str)) else self.doc_to_choice(doc)[text] def get_target(self, doc): @@ -37,8 +37,8 @@ def get_target(self, doc): else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) ) - def add_in_context_example(self, doc, model_specific_prompt_kwargs=None, data_frame=None, index=None): - question = self.get_question(doc, model_specific_prompt_kwargs) + def add_in_context_example(self, doc, data_frame=None, index=None): + question = self.get_question(doc) if data_frame and index: visual = LazyLoadedImages(data_frame, index) else: @@ -51,8 +51,8 @@ def add_in_context_example(self, doc, model_specific_prompt_kwargs=None, data_fr self.contexts.append(target) self.contexts.append(self.few_shot_delimiter) - def add_question(self, doc, model_specific_prompt_kwargs=None, data_frame=None, index=None): - question = self.doc_to_text(doc, model_specific_prompt_kwargs) + def add_question(self, doc, data_frame=None, index=None): + question = self.get_question(doc) if data_frame and index: visual = LazyLoadedImages(data_frame, index) else: From a2e6e0e1285f90276237c02087967b4b2406999e Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 00:22:52 +0800 Subject: [PATCH 14/38] Refactor get_context method in ContextSampler class --- lmms_eval/api/samplers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index 41418380..f25a2161 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -109,7 +109,7 @@ def __init__(self, docs: FewShotDataset, task, fewshot_indices=None, rnd=None) - if fewshot_indices: # subset few-shot docs from self.docs.fewshot_indices = fewshot_indices - def get_context(self, doc, num_fewshot, model_specific_prompt_kwargs=None): + def get_context(self, doc, num_fewshot): # draw an extra fewshot sample if using same split as evaluating on n_samples = num_fewshot + 1 if self.docs.same_as_eval else num_fewshot @@ -123,7 +123,7 @@ def get_context(self, doc, num_fewshot, model_specific_prompt_kwargs=None): labeled_examples = Context(self.task, self.fewshot_delimiter, self.target_delimiter) for idx, doc in selected_docs: - labeled_examples.add_in_context_example(doc, model_specific_prompt_kwargs, self.docs, idx) + labeled_examples.add_in_context_example(doc, self.docs, idx) return labeled_examples From f6fc367689f508af9db2e4f697d3d20957fa2e11 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 00:32:22 +0800 Subject: [PATCH 15/38] Add description to Context class and update ConfigurableTask to include the description in labeled examples --- lmms_eval/api/samplers.py | 10 ++++++++-- lmms_eval/api/task.py | 12 ++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index f25a2161..43f95319 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -12,7 +12,7 @@ def get_images(self, doc_to_visual): class Context(object): - def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n"): + def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n", description = None): self.task = task self.config = task._config @@ -25,6 +25,12 @@ def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str self.few_shot_delimiter = few_shot_delimiter self.contexts = [] + + if 
description: + self.add_description(description) + + def add_description(self, description): + self.contexts = [description] + self.contexts def get_question(self, doc): text = self.doc_to_text(doc) @@ -109,7 +115,7 @@ def __init__(self, docs: FewShotDataset, task, fewshot_indices=None, rnd=None) - if fewshot_indices: # subset few-shot docs from self.docs.fewshot_indices = fewshot_indices - def get_context(self, doc, num_fewshot): + def get_context(self, doc, num_fewshot) -> Context: # draw an extra fewshot sample if using same split as evaluating on n_samples = num_fewshot + 1 if self.docs.same_as_eval else num_fewshot diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index abd36470..b365673a 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -788,20 +788,20 @@ def fewshot_context(self, doc_id, num_fewshot, split): doc = self.dataset_no_image[split][doc_id] if num_fewshot == 0: # always prepend the (possibly empty) task description - labeled_examples = self.config.description + labeled_examples = [self.config.description] else: - labeled_examples = self.config.description + self.sampler.get_context(doc, num_fewshot) + labeled_examples = [self.config.description] + self.sampler.get_context(doc, num_fewshot).contexts example = self.doc_to_text(doc) if type(example) == str: - return labeled_examples + example + return labeled_examples + [example] elif type(example) == list: - return [labeled_examples + ex for ex in example] + return labeled_examples + [ex for ex in example] elif type(example) == int: if self.config.doc_to_choice is not None: choices = self.doc_to_choice(doc) - return labeled_examples + choices[example] + return labeled_examples + [choices[example]] else: - return labeled_examples + str(example) + return labeled_examples + [str(example)] def apply_filters(self): if hasattr(self, "_filters"): From 826e5fec5fec625d5ce811c5008647da460c4f64 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 00:33:36 +0800 Subject: [PATCH 16/38] lint --- lmms_eval/api/samplers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index 43f95319..1940c158 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -12,7 +12,7 @@ def get_images(self, doc_to_visual): class Context(object): - def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n", description = None): + def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n", description=None): self.task = task self.config = task._config @@ -25,10 +25,10 @@ def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str self.few_shot_delimiter = few_shot_delimiter self.contexts = [] - + if description: self.add_description(description) - + def add_description(self, description): self.contexts = [description] + self.contexts From e4cb4e67ce3e1d867b3cecd1e807195aff2d440d Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 14:53:48 +0800 Subject: [PATCH 17/38] fix visuals --- lmms_eval/models/llava.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index b93be935..3bb0b30b 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -270,15 +270,15 @@ def _collate(x): contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) task = task[0] split = split[0] - visuals = 
[doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] - visuals = self.flatten(visuals) + batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] + flattened_visuals = self.flatten(batched_visuals) # [B*N] ############### for debugging ################### # TODO: remove this block - if len(visuals) > 1: - for i in range(len(visuals)): - path = f"./logs/llava/{i}.png" - visuals[i].save(path) - pass + # if len(visuals) > 1: + # for i in range(len(visuals)): + # path = f"./logs/llava/{i}.png" + # visuals[i].save(path) + # pass ################################################# # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. @@ -300,8 +300,8 @@ def _collate(x): self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio") eval_logger.info(f"Setting image aspect ratio: {self._config.image_aspect_ratio}") # encode, pad, and truncate contexts for this batch - if visuals: - image_tensor = process_images(visuals, self._image_processor, self._config) + if flattened_visuals: + image_tensor = process_images(flattened_visuals, self._image_processor, self._config) if type(image_tensor) is list: image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor] else: @@ -313,7 +313,7 @@ def _collate(x): question_input = [] - for visual, context in zip(visuals, contexts): + for visual, context in zip(batched_visuals, contexts): if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: """ Three senarios: @@ -323,6 +323,8 @@ def _collate(x): """ image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] image_tokens = " ".join(image_tokens) + if isinstance(context, list): + context = "".join(context) question = image_tokens + "\n" + context else: question = context @@ -336,7 +338,7 @@ def _collate(x): # The above for loop has bugs. When there is no visuals, e.g. 
pure text, # there will be no for loop execute resulting in an empty question_input (because no visuals) # Scenario 1 won't even be execute - if len(visuals) == 0: + if len(flattened_visuals) == 0: for context in contexts: question = context conv = conv_templates[self.conv_template].copy() @@ -347,7 +349,7 @@ def _collate(x): # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) # preconfigure gen_kwargs with defaults - gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + gen_kwargs["image_sizes"] = [flattened_visuals[idx].size for idx in range(len(flattened_visuals)) if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: From 0e26684691703baff229609019d3044dc892ef65 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 14:56:32 +0800 Subject: [PATCH 18/38] lint --- lmms_eval/models/llava.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 3bb0b30b..3a9c8407 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -270,8 +270,8 @@ def _collate(x): contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) task = task[0] split = split[0] - batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] - flattened_visuals = self.flatten(batched_visuals) # [B*N] + batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] + flattened_visuals = self.flatten(batched_visuals) # [B*N] ############### for debugging ################### # TODO: remove this block # if len(visuals) > 1: @@ -349,7 +349,7 @@ def _collate(x): # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) # preconfigure gen_kwargs with defaults - gen_kwargs["image_sizes"] = [flattened_visuals[idx].size for idx in range(len(flattened_visuals)) + gen_kwargs["image_sizes"] = [flattened_visuals[idx].size for idx in range(len(flattened_visuals))] if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: From b202dad57430f8d25bc84612ba62f58db61bac5e Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 15:13:06 +0800 Subject: [PATCH 19/38] textvqa doc_to_target --- lmms_eval/tasks/textvqa/_default_template_textvqa_yaml | 2 +- lmms_eval/tasks/textvqa/utils.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml b/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml index fd485002..5c6205e9 100644 --- a/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml +++ b/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml @@ -2,7 +2,7 @@ dataset_path: lmms-lab/textvqa output_type: generate_until doc_to_visual: !function utils.textvqa_doc_to_visual doc_to_text: !function utils.textvqa_doc_to_text -doc_to_target: "answer" +doc_to_target: !function utils.textvqa_doc_to_target generation_kwargs: until: - "ASSISTANT:" diff --git a/lmms_eval/tasks/textvqa/utils.py b/lmms_eval/tasks/textvqa/utils.py index ea3b503b..dd87625d 100644 --- a/lmms_eval/tasks/textvqa/utils.py +++ b/lmms_eval/tasks/textvqa/utils.py @@ -1,5 +1,6 @@ import re import os +import random import json import yaml import pathlib @@ -66,3 +67,7 @@ def textvqa_aggreate_submissions(results, args): json.dump(results, f) # 
print(f"Submission file saved to {path}") eval_logger.info(f"Submission file saved to {path}") + +def textvqa_doc_to_target(doc): + answers = doc["answers"] + return random.choice(answers) if isinstance(answers, list) else answers From c2e3a03e2f973948b8677e5eaf97836653c9fd8a Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 16:40:45 +0800 Subject: [PATCH 20/38] Update lmms_eval/api/samplers.py and lmms_eval/api/task.py --- lmms_eval/api/samplers.py | 25 ++++++++++++++++++++++++- lmms_eval/api/task.py | 10 +++++----- lmms_eval/tasks/textvqa/utils.py | 1 + lmms_eval/utils.py | 4 +++- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index 1940c158..e24ae126 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -1,5 +1,5 @@ import datasets -from typing import Callable +from typing import Callable, Iterable, Optional class LazyLoadedImages(object): @@ -68,6 +68,29 @@ def add_question(self, doc, data_frame=None, index=None): self.contexts.append(question) self.contexts.append(self.target_delimiter) + def get_text(self, image_tokens=""): + texts = [] + for context in self.contexts: + if isinstance(context, LazyLoadedImages): + if isinstance(image_tokens, str): + texts.append(image_tokens) + else: + texts.append(image_tokens(context)) + else: + texts.append(context) + return "".join(texts) + + def get_visions(self): + return [context.get_images() for context in self.contexts if isinstance(context, LazyLoadedImages)] + + def extend(self, context): + if isinstance(context, list): + self.contexts.extend(context) + elif isinstance(context, Context): + self.contexts.extend(context.contexts) + else: + raise ValueError(f"Cannot extend context with object of type {type(context)}") + class FewShotDataset(object): def __init__(self, dataset=None, *, dataset_path: str = None, dataset_name: str = None, split: str = None, dataset_kwargs: dict = None, same_as_eval: bool = False): diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index b365673a..a467a9bc 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -785,12 +785,12 @@ def fewshot_context(self, doc_id, num_fewshot, split): :returns: str The fewshot context. 
""" + from lmms_eval.api.samplers import Context + doc = self.dataset_no_image[split][doc_id] - if num_fewshot == 0: - # always prepend the (possibly empty) task description - labeled_examples = [self.config.description] - else: - labeled_examples = [self.config.description] + self.sampler.get_context(doc, num_fewshot).contexts + labeled_examples = Context(self, self.config.fewshot_delimiter, self.config.target_delimiter, self.config.description) + if num_fewshot != 0: + labeled_examples.extend(self.sampler.get_context(doc, num_fewshot)) example = self.doc_to_text(doc) if type(example) == str: return labeled_examples + [example] diff --git a/lmms_eval/tasks/textvqa/utils.py b/lmms_eval/tasks/textvqa/utils.py index dd87625d..7a1ef194 100644 --- a/lmms_eval/tasks/textvqa/utils.py +++ b/lmms_eval/tasks/textvqa/utils.py @@ -68,6 +68,7 @@ def textvqa_aggreate_submissions(results, args): # print(f"Submission file saved to {path}") eval_logger.info(f"Submission file saved to {path}") + def textvqa_doc_to_target(doc): answers = doc["answers"] return random.choice(answers) if isinstance(answers, list) else answers diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py index 649241a5..047d8ad4 100644 --- a/lmms_eval/utils.py +++ b/lmms_eval/utils.py @@ -820,7 +820,9 @@ def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> List: Yields: List: Yields reordered elements one by one. """ - arr = sorted(arr, key=lambda x: self.fn(x[1])) + from lmms_eval.api.samplers import Context + + arr = sorted(arr, key=lambda x: self.fn(x[1].get_text() if isinstance(x, Context) else x[1])) self.reorder_indices.extend([x[0] for x in arr]) yield from [x[1] for x in arr] From 83b347748f6c1cfffc34d78de0bd5f44a21df109 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 16:44:16 +0800 Subject: [PATCH 21/38] Add append method to Context class and refactor example handling in ConfigurableTask --- lmms_eval/api/samplers.py | 3 +++ lmms_eval/api/task.py | 13 ++----------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index e24ae126..f7c694bd 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -90,6 +90,9 @@ def extend(self, context): self.contexts.extend(context.contexts) else: raise ValueError(f"Cannot extend context with object of type {type(context)}") + + def append(self, context): + self.contexts.append(context) class FewShotDataset(object): diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index a467a9bc..cc359962 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -791,17 +791,8 @@ def fewshot_context(self, doc_id, num_fewshot, split): labeled_examples = Context(self, self.config.fewshot_delimiter, self.config.target_delimiter, self.config.description) if num_fewshot != 0: labeled_examples.extend(self.sampler.get_context(doc, num_fewshot)) - example = self.doc_to_text(doc) - if type(example) == str: - return labeled_examples + [example] - elif type(example) == list: - return labeled_examples + [ex for ex in example] - elif type(example) == int: - if self.config.doc_to_choice is not None: - choices = self.doc_to_choice(doc) - return labeled_examples + [choices[example]] - else: - return labeled_examples + [str(example)] + labeled_examples.add_question(doc) + return labeled_examples def apply_filters(self): if hasattr(self, "_filters"): From 961158ed40d2e3a749052d0d4f85ce8aed8a43f0 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 17:00:40 +0800 Subject: [PATCH 22/38] 
Refactor sorting logic in lmms_eval/utils.py --- lmms_eval/api/samplers.py | 7 ++++++- lmms_eval/utils.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index f7c694bd..e710fed7 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -90,10 +90,15 @@ def extend(self, context): self.contexts.extend(context.contexts) else: raise ValueError(f"Cannot extend context with object of type {type(context)}") - + def append(self, context): self.contexts.append(context) + def __lt__(self, other): + if not isinstance(other, Context): + return NotImplemented + return self.get_text() < other.get_text() + class FewShotDataset(object): def __init__(self, dataset=None, *, dataset_path: str = None, dataset_name: str = None, split: str = None, dataset_kwargs: dict = None, same_as_eval: bool = False): diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py index 047d8ad4..45d71d9e 100644 --- a/lmms_eval/utils.py +++ b/lmms_eval/utils.py @@ -822,7 +822,7 @@ def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> List: """ from lmms_eval.api.samplers import Context - arr = sorted(arr, key=lambda x: self.fn(x[1].get_text() if isinstance(x, Context) else x[1])) + arr = sorted(arr, key=lambda x: self.fn(x[1])) self.reorder_indices.extend([x[0] for x in arr]) yield from [x[1] for x in arr] From d8dcd0a29bef5346bc68f3918bad55b35994255e Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 18:13:36 +0800 Subject: [PATCH 23/38] Refactor get_text method in Context class --- lmms_eval/api/samplers.py | 20 ++++++++++++++++---- lmms_eval/models/llava.py | 9 ++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index e710fed7..c6c519e5 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -68,20 +68,29 @@ def add_question(self, doc, data_frame=None, index=None): self.contexts.append(question) self.contexts.append(self.target_delimiter) - def get_text(self, image_tokens=""): + def get_text(self, *, image_tokens="", lazy=True): texts = [] + vision = [] for context in self.contexts: if isinstance(context, LazyLoadedImages): if isinstance(image_tokens, str): - texts.append(image_tokens) + if lazy: + texts.append(image_tokens) + else: + now_vision = context.get_images(self.doc_to_visual) + vision.extend(now_vision) + texts.append(image_tokens * len(now_vision)) else: texts.append(image_tokens(context)) else: texts.append(context) - return "".join(texts) + if lazy: + return "".join(texts) + else: + return "".join(texts), vision def get_visions(self): - return [context.get_images() for context in self.contexts if isinstance(context, LazyLoadedImages)] + return [context.get_images(self.doc_to_visual) for context in self.contexts if isinstance(context, LazyLoadedImages)] def extend(self, context): if isinstance(context, list): @@ -94,6 +103,9 @@ def extend(self, context): def append(self, context): self.contexts.append(context) + def __str__(self): + return self.get_text() + def __lt__(self, other): if not isinstance(other, Context): return NotImplemented diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 3a9c8407..79ca0853 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -256,7 +256,7 @@ def _collate(x): # padded context length. 
this is useful to simplify the batching logic and more importantly to make # automatic adaptive batches much much easier to implement # - any OOMs will happen right away rather than near the end - toks = self.tok_encode(x[0]) + toks = self.tok_encode(str(x[0])) return -len(toks), x[0] # we group requests by their generation_kwargs, @@ -270,8 +270,11 @@ def _collate(x): contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) task = task[0] split = split[0] - batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] + # batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] + contexts_texts, batched_visuals = zip(*[context.get_text(lazy=False) for context in contexts]) # [B, N] flattened_visuals = self.flatten(batched_visuals) # [B*N] + # batched_visuals = context.get_visions() # [B, N] + # flattened_visuals = contexts[0].get_visions() # [B*N] ############### for debugging ################### # TODO: remove this block # if len(visuals) > 1: @@ -313,7 +316,7 @@ def _collate(x): question_input = [] - for visual, context in zip(batched_visuals, contexts): + for visual, context in zip(batched_visuals, contexts_texts): if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: """ Three senarios: From 3e2b24f06c953516494ad0497a4bfe5456941e1c Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 18:24:15 +0800 Subject: [PATCH 24/38] Refactor image token handling in LMMS evaluation code --- lmms_eval/api/samplers.py | 7 +++++++ lmms_eval/models/llava.py | 31 ++++++++++++++++--------------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index c6c519e5..bc645062 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -71,6 +71,13 @@ def add_question(self, doc, data_frame=None, index=None): def get_text(self, *, image_tokens="", lazy=True): texts = [] vision = [] + already_have_images = False + for context in self.contexts: + if isinstance(context, str) and image_tokens in context: + already_have_images = True + break + if already_have_images: + image_tokens = "" for context in self.contexts: if isinstance(context, LazyLoadedImages): if isinstance(image_tokens, str): diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 79ca0853..58ccb6e6 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -271,7 +271,7 @@ def _collate(x): task = task[0] split = split[0] # batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] - contexts_texts, batched_visuals = zip(*[context.get_text(lazy=False) for context in contexts]) # [B, N] + contexts_texts, batched_visuals = zip(*[context.get_text(image_tokens=DEFAULT_IMAGE_TOKEN ,lazy=False) for context in contexts]) # [B, N] flattened_visuals = self.flatten(batched_visuals) # [B*N] # batched_visuals = context.get_visions() # [B, N] # flattened_visuals = contexts[0].get_visions() # [B*N] @@ -317,20 +317,21 @@ def _collate(x): question_input = [] for visual, context in zip(batched_visuals, contexts_texts): - if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: - """ - Three senarios: - 1. No image, and there for, no image token should be added. - 2. image token is already specified in the context, so we don't need to add it. - 3. image token is not specified in the context and there is image inputs, so we need to add it. 
In this case, we add the image token at the beginning of the context and add a new line. - """ - image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] - image_tokens = " ".join(image_tokens) - if isinstance(context, list): - context = "".join(context) - question = image_tokens + "\n" + context - else: - question = context + question = context + # if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: + # """ + # Three senarios: + # 1. No image, and there for, no image token should be added. + # 2. image token is already specified in the context, so we don't need to add it. + # 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + # """ + # image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] + # image_tokens = " ".join(image_tokens) + # if isinstance(context, list): + # context = "".join(context) + # question = image_tokens + "\n" + context + # else: + # question = context conv = conv_templates[self.conv_template].copy() conv.append_message(conv.roles[0], question) From 7b4b4fa06d7dbcdf2831a32642ba909b551edde3 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 18:26:21 +0800 Subject: [PATCH 25/38] lint --- lmms_eval/models/llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 58ccb6e6..1ee6cf7e 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -271,7 +271,7 @@ def _collate(x): task = task[0] split = split[0] # batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] - contexts_texts, batched_visuals = zip(*[context.get_text(image_tokens=DEFAULT_IMAGE_TOKEN ,lazy=False) for context in contexts]) # [B, N] + contexts_texts, batched_visuals = zip(*[context.get_text(image_tokens=DEFAULT_IMAGE_TOKEN, lazy=False) for context in contexts]) # [B, N] flattened_visuals = self.flatten(batched_visuals) # [B*N] # batched_visuals = context.get_visions() # [B, N] # flattened_visuals = contexts[0].get_visions() # [B*N] From f180d084553ec23c8fbc0375fc2d54a3d27b1f68 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sun, 31 Mar 2024 19:10:24 +0800 Subject: [PATCH 26/38] Refactor context and question addition methods --- lmms_eval/api/samplers.py | 17 +++++++++++++++-- lmms_eval/models/llava.py | 14 +++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index bc645062..e0268f02 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -10,6 +10,15 @@ def __init__(self, data_frame, index): def get_images(self, doc_to_visual): return doc_to_visual(self.data_frame[self.index]) +from abc import ABC, abstractmethod + +class ContextProcessors(ABC): + @abstractmethod + def process(self, question, answer = None): + raise NotImplementedError + + def __call__(self, question, answer = None): + return self.process(question, answer) class Context(object): def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n", description=None): @@ -43,7 +52,7 @@ def get_target(self, doc): else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) ) - def add_in_context_example(self, 
doc, data_frame=None, index=None): + def add_in_context_example(self, doc, data_frame=None, index=None, context_processors: Optional[ContextProcessors] = None): question = self.get_question(doc) if data_frame and index: visual = LazyLoadedImages(data_frame, index) @@ -52,12 +61,14 @@ def add_in_context_example(self, doc, data_frame=None, index=None): target = self.doc_to_target(doc) if visual: self.contexts.append(visual) + if context_processors: + question, target = context_processors(question, target) self.contexts.append(question) self.contexts.append(self.target_delimiter) self.contexts.append(target) self.contexts.append(self.few_shot_delimiter) - def add_question(self, doc, data_frame=None, index=None): + def add_question(self, doc, data_frame=None, index=None, context_processors: Optional[ContextProcessors] = None): question = self.get_question(doc) if data_frame and index: visual = LazyLoadedImages(data_frame, index) @@ -65,6 +76,8 @@ def add_question(self, doc, data_frame=None, index=None): visual = None if visual: self.contexts.append(visual) + if context_processors: + question, _ = context_processors(question) self.contexts.append(question) self.contexts.append(self.target_delimiter) diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 1ee6cf7e..7d7896bd 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -316,8 +316,7 @@ def _collate(x): question_input = [] - for visual, context in zip(batched_visuals, contexts_texts): - question = context + for context in contexts_texts: # if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: # """ # Three senarios: @@ -333,11 +332,12 @@ def _collate(x): # else: # question = context - conv = conv_templates[self.conv_template].copy() - conv.append_message(conv.roles[0], question) - conv.append_message(conv.roles[1], None) - prompt_question = conv.get_prompt() - question_input.append(prompt_question) + # conv = conv_templates[self.conv_template].copy() + # conv.append_message(conv.roles[0], question) + # conv.append_message(conv.roles[1], None) + # prompt_question = conv.get_prompt() + # question_input.append(prompt_question) + question_input.append(contexts) # The above for loop has bugs. When there is no visuals, e.g. 
pure text, # there will be no for loop execute resulting in an empty question_input (because no visuals) From 4e1188a39c6f406cbd9914b1d4536eceaf12656c Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Tue, 2 Apr 2024 00:25:09 +0800 Subject: [PATCH 27/38] Refactor code to improve readability and add new features --- lmms_eval/api/samplers.py | 91 +++++++++++++++++++++++++++------------ lmms_eval/models/llava.py | 25 ++++++++--- 2 files changed, 82 insertions(+), 34 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index e0268f02..6fcfc598 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -1,24 +1,65 @@ import datasets from typing import Callable, Iterable, Optional - - -class LazyLoadedImages(object): - def __init__(self, data_frame, index): - self.data_frame: datasets.Dataset = data_frame - self.index = index - - def get_images(self, doc_to_visual): - return doc_to_visual(self.data_frame[self.index]) - from abc import ABC, abstractmethod -class ContextProcessors(ABC): +class ContextObject(ABC): @abstractmethod - def process(self, question, answer = None): + def get_text(self): raise NotImplementedError - def __call__(self, question, answer = None): - return self.process(question, answer) + def __str__(self): + return self.get_text() + +class QAPairs(ContextObject): + def __init__(self, question: str, answer: Optional[str] = None, delimiter="\n", role_question: str = "USER: ", role_answer: str = "ASSISTANT: "): + self.question = question + self.answer = answer + self.delimiter = delimiter + self.role_question = role_question + self.role_answer = role_answer + + def get_text(self): + if self.answer is None: + return self.role_question + self.question + self.delimiter + else: + return self.role_question + self.question + self.delimiter + self.role_answer + self.answer + +class LazyLoadedImages(ContextObject): + def __init__(self, data_frame, index, doc_to_visual: Callable, image_tokens=""): + self.data_frame: datasets.Dataset = data_frame + self.index = index + self.image_lens = None + self.images = None + self.doc_to_visual = doc_to_visual + self.image_tokens = image_tokens + + def get_images(self, lazy_save=False): + if self.images is not None: + return self.images + images = self.doc_to_visual(self.data_frame[self.index]) + self.image_lens = len(images) + if lazy_save: + self.images = images + return images + + def get_num_images(self, lazy_save=False): + if self.image_lens is None: + images = self.get_images(self.doc_to_visual) + if lazy_save: + self.images = images + self.image_lens = len(images) + return self.image_lens + + def clear(self, clear_all = False): + self.images = None + if clear_all: + self.image_lens = None + + def get_text(self, lazy: bool = True): + if lazy: + return self.image_tokens + else: + return " ".join([self.image_tokens] * self.get_num_images()) class Context(object): def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n", description=None): @@ -52,33 +93,27 @@ def get_target(self, doc): else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) ) - def add_in_context_example(self, doc, data_frame=None, index=None, context_processors: Optional[ContextProcessors] = None): + def add_in_context_example(self, doc, data_frame=None, index=None): question = self.get_question(doc) if data_frame and index: - visual = LazyLoadedImages(data_frame, index) + visual = 
LazyLoadedImages(data_frame, index, self.doc_to_visual) else: visual = None target = self.doc_to_target(doc) if visual: self.contexts.append(visual) - if context_processors: - question, target = context_processors(question, target) - self.contexts.append(question) - self.contexts.append(self.target_delimiter) - self.contexts.append(target) + self.contexts.append(QAPairs(question, target, self.target_delimiter)) self.contexts.append(self.few_shot_delimiter) - def add_question(self, doc, data_frame=None, index=None, context_processors: Optional[ContextProcessors] = None): + def add_question(self, doc, data_frame=None, index=None): question = self.get_question(doc) if data_frame and index: - visual = LazyLoadedImages(data_frame, index) + visual = LazyLoadedImages(data_frame, index, self.doc_to_visual) else: visual = None if visual: self.contexts.append(visual) - if context_processors: - question, _ = context_processors(question) - self.contexts.append(question) + self.contexts.append(QAPairs(question)) self.contexts.append(self.target_delimiter) def get_text(self, *, image_tokens="", lazy=True): @@ -103,14 +138,14 @@ def get_text(self, *, image_tokens="", lazy=True): else: texts.append(image_tokens(context)) else: - texts.append(context) + texts.append(str(context)) if lazy: return "".join(texts) else: return "".join(texts), vision def get_visions(self): - return [context.get_images(self.doc_to_visual) for context in self.contexts if isinstance(context, LazyLoadedImages)] + return sum([context.get_images(self.doc_to_visual) for context in self.contexts if isinstance(context, LazyLoadedImages)], start = []) def extend(self, context): if isinstance(context, list): diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 7d7896bd..93da449f 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -271,7 +271,8 @@ def _collate(x): task = task[0] split = split[0] # batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] - contexts_texts, batched_visuals = zip(*[context.get_text(image_tokens=DEFAULT_IMAGE_TOKEN, lazy=False) for context in contexts]) # [B, N] + batched_visuals = [context.get_visions() for context in contexts] # [B, N] + # contexts_texts, batched_visuals = zip(*[context.get_text(image_tokens=DEFAULT_IMAGE_TOKEN, lazy=False) for context in contexts]) # [B, N] flattened_visuals = self.flatten(batched_visuals) # [B*N] # batched_visuals = context.get_visions() # [B, N] # flattened_visuals = contexts[0].get_visions() # [B*N] @@ -316,7 +317,7 @@ def _collate(x): question_input = [] - for context in contexts_texts: + for context in contexts: # if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: # """ # Three senarios: @@ -332,12 +333,24 @@ def _collate(x): # else: # question = context - # conv = conv_templates[self.conv_template].copy() + conv = conv_templates[self.conv_template].copy() + + num_image_tokens = 0 + from lmms_eval.api.samplers import LazyLoadedImages, QAPairs + for obj in context.contexts: + if isinstance(obj, LazyLoadedImages): + num_image_tokens += obj.get_num_images() + elif isinstance(obj, QAPairs): + question = " ".join(num_image_tokens * [DEFAULT_IMAGE_TOKEN]) + "\n" + obj.question + answer = obj.answer + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], answer) + + # conv.append_message(conv.roles[0], question) # conv.append_message(conv.roles[1], None) - # prompt_question = conv.get_prompt() - # 
question_input.append(prompt_question) - question_input.append(contexts) + prompt_question = conv.get_prompt() + question_input.append(prompt_question) # The above for loop has bugs. When there is no visuals, e.g. pure text, # there will be no for loop execute resulting in an empty question_input (because no visuals) From 61236aa78d1940139d2867c9738f36ca60f0e4fc Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Tue, 2 Apr 2024 00:31:26 +0800 Subject: [PATCH 28/38] lint --- lmms_eval/api/samplers.py | 18 +++++++++++------- lmms_eval/models/llava.py | 7 +++++-- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index 6fcfc598..6683bba9 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -2,13 +2,15 @@ from typing import Callable, Iterable, Optional from abc import ABC, abstractmethod + class ContextObject(ABC): @abstractmethod def get_text(self): raise NotImplementedError - + def __str__(self): - return self.get_text() + return self.get_text() + class QAPairs(ContextObject): def __init__(self, question: str, answer: Optional[str] = None, delimiter="\n", role_question: str = "USER: ", role_answer: str = "ASSISTANT: "): @@ -17,13 +19,14 @@ def __init__(self, question: str, answer: Optional[str] = None, delimiter="\n", self.delimiter = delimiter self.role_question = role_question self.role_answer = role_answer - + def get_text(self): if self.answer is None: return self.role_question + self.question + self.delimiter else: return self.role_question + self.question + self.delimiter + self.role_answer + self.answer + class LazyLoadedImages(ContextObject): def __init__(self, data_frame, index, doc_to_visual: Callable, image_tokens=""): self.data_frame: datasets.Dataset = data_frame @@ -41,7 +44,7 @@ def get_images(self, lazy_save=False): if lazy_save: self.images = images return images - + def get_num_images(self, lazy_save=False): if self.image_lens is None: images = self.get_images(self.doc_to_visual) @@ -50,17 +53,18 @@ def get_num_images(self, lazy_save=False): self.image_lens = len(images) return self.image_lens - def clear(self, clear_all = False): + def clear(self, clear_all=False): self.images = None if clear_all: self.image_lens = None - + def get_text(self, lazy: bool = True): if lazy: return self.image_tokens else: return " ".join([self.image_tokens] * self.get_num_images()) + class Context(object): def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n", description=None): self.task = task @@ -145,7 +149,7 @@ def get_text(self, *, image_tokens="", lazy=True): return "".join(texts), vision def get_visions(self): - return sum([context.get_images(self.doc_to_visual) for context in self.contexts if isinstance(context, LazyLoadedImages)], start = []) + return sum([context.get_images(self.doc_to_visual) for context in self.contexts if isinstance(context, LazyLoadedImages)], start=[]) def extend(self, context): if isinstance(context, list): diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 93da449f..1f6a88c0 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -337,15 +337,18 @@ def _collate(x): num_image_tokens = 0 from lmms_eval.api.samplers import LazyLoadedImages, QAPairs + for obj in context.contexts: if isinstance(obj, LazyLoadedImages): num_image_tokens += obj.get_num_images() elif isinstance(obj, QAPairs): - question = " ".join(num_image_tokens * [DEFAULT_IMAGE_TOKEN]) + "\n" + obj.question + if num_image_tokens == 0: + question 
= obj.question + else: + question = " ".join(num_image_tokens * [DEFAULT_IMAGE_TOKEN]) + "\n" + obj.question answer = obj.answer conv.append_message(conv.roles[0], question) conv.append_message(conv.roles[1], answer) - # conv.append_message(conv.roles[0], question) # conv.append_message(conv.roles[1], None) From aeaefc2d1f10d825ba210c6213a5c5b0e9c9f511 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Tue, 2 Apr 2024 00:37:37 +0800 Subject: [PATCH 29/38] why so many bugs --- lmms_eval/models/llava.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 1f6a88c0..ea5e7d03 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -402,6 +402,7 @@ def _collate(x): text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True) except Exception as e: eval_logger.error(f"Error {e} in generating") + e.with_traceback() cont = "" text_outputs = [""] From 6812878cadbd3994c0c8a67883fb8faa42f9a517 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Tue, 2 Apr 2024 01:05:46 +0800 Subject: [PATCH 30/38] fix bug --- lmms_eval/api/samplers.py | 2 +- lmms_eval/api/task.py | 2 +- lmms_eval/models/llava.py | 20 ++++++++++++-------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index 6683bba9..e1a3dda2 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -118,7 +118,7 @@ def add_question(self, doc, data_frame=None, index=None): if visual: self.contexts.append(visual) self.contexts.append(QAPairs(question)) - self.contexts.append(self.target_delimiter) + # self.contexts.append(self.target_delimiter) def get_text(self, *, image_tokens="", lazy=True): texts = [] diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index cc359962..0631fed9 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -791,7 +791,7 @@ def fewshot_context(self, doc_id, num_fewshot, split): labeled_examples = Context(self, self.config.fewshot_delimiter, self.config.target_delimiter, self.config.description) if num_fewshot != 0: labeled_examples.extend(self.sampler.get_context(doc, num_fewshot)) - labeled_examples.add_question(doc) + labeled_examples.add_question(doc, self.test_docs(), doc_id) return labeled_examples def apply_filters(self): diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index ea5e7d03..a830fd8d 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -349,6 +349,7 @@ def _collate(x): answer = obj.answer conv.append_message(conv.roles[0], question) conv.append_message(conv.roles[1], answer) + num_image_tokens = 0 # conv.append_message(conv.roles[0], question) # conv.append_message(conv.roles[1], None) @@ -358,14 +359,17 @@ def _collate(x): # The above for loop has bugs. When there is no visuals, e.g. 
pure text, # there will be no for loop execute resulting in an empty question_input (because no visuals) # Scenario 1 won't even be execute - if len(flattened_visuals) == 0: - for context in contexts: - question = context - conv = conv_templates[self.conv_template].copy() - conv.append_message(conv.roles[0], question) - conv.append_message(conv.roles[1], None) - prompt_question = conv.get_prompt() - question_input.append(prompt_question) + # if len(flattened_visuals) == 0: + # for context in contexts: + # question = context + # conv = conv_templates[self.conv_template].copy() + # conv.append_message(conv.roles[0], question) + # conv.append_message(conv.roles[1], None) + # try: + # prompt_question = conv.get_prompt() + # except Exception as e: + # pass + # question_input.append(prompt_question) # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) # preconfigure gen_kwargs with defaults From 0b51d45c5646460c0e5141c413557b23ba9d9153 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Tue, 2 Apr 2024 01:07:43 +0800 Subject: [PATCH 31/38] llava-textvqa done --- lmms_eval/api/task.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 0631fed9..89d7d611 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -552,11 +552,11 @@ def __init__(self, model_name) -> None: # TODO no super() call here else: self._filters = [build_filter_ensemble("none", [["take_first", None]])] ########################################## - # TODO: for test, will delete later - if self.config.task == "textvqa_test": - pass - else: - pass + # # TODO: for test, will delete later + # if self.config.task == "textvqa_test": + # pass + # else: + # pass ########################################### if self.config.fewshot_config is not None: try: From 7607c2808ea7ded9b045bc36107e91342d1c906d Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Tue, 2 Apr 2024 01:22:39 +0800 Subject: [PATCH 32/38] Update construct_requests method signature --- lmms_eval/api/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 89d7d611..ec1b85b3 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -22,7 +22,7 @@ from lmms_eval import utils from lmms_eval.api import samplers from lmms_eval.api.instance import Instance -from lmms_eval.api.samplers import FewShotDataset +from lmms_eval.api.samplers import FewShotDataset, Context from lmms_eval.filters import build_filter_ensemble from lmms_eval.api.registry import ( @@ -935,7 +935,7 @@ def doc_to_choice(self, doc: Any) -> List[str]: else: raise TypeError - def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Instance], Instance]: + def construct_requests(self, doc_id: int, ctx: Context, **kwargs) -> Union[List[Instance], Instance]: split = kwargs.get("split") kwargs.pop("split") if self.OUTPUT_TYPE == "loglikelihood": From d5ac6249c39a7e217d3ef798c84de035422ec7ee Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Tue, 2 Apr 2024 04:17:56 +0800 Subject: [PATCH 33/38] make contexts lists only have qa pairs --- lmms_eval/api/samplers.py | 222 +++++++++++++++++++++++--------------- lmms_eval/models/llava.py | 23 ++-- 2 files changed, 147 insertions(+), 98 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index e1a3dda2..c64b2839 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -1,33 +1,9 @@ import 
datasets -from typing import Callable, Iterable, Optional +from typing import Callable, Iterable, Optional, List from abc import ABC, abstractmethod -class ContextObject(ABC): - @abstractmethod - def get_text(self): - raise NotImplementedError - - def __str__(self): - return self.get_text() - - -class QAPairs(ContextObject): - def __init__(self, question: str, answer: Optional[str] = None, delimiter="\n", role_question: str = "USER: ", role_answer: str = "ASSISTANT: "): - self.question = question - self.answer = answer - self.delimiter = delimiter - self.role_question = role_question - self.role_answer = role_answer - - def get_text(self): - if self.answer is None: - return self.role_question + self.question + self.delimiter - else: - return self.role_question + self.question + self.delimiter + self.role_answer + self.answer - - -class LazyLoadedImages(ContextObject): +class LazyLoadedImages(object): def __init__(self, data_frame, index, doc_to_visual: Callable, image_tokens=""): self.data_frame: datasets.Dataset = data_frame self.index = index @@ -65,6 +41,80 @@ def get_text(self, lazy: bool = True): return " ".join([self.image_tokens] * self.get_num_images()) +class QAPairs(object): + def __init__( + self, + data_frame, + index, + *, + doc=None, + include_answer: bool = True, + doc_to_text: Callable, + doc_to_target: Optional[Callable] = None, + doc_to_choice: Optional[Callable] = None, + doc_to_visual: Optional[Callable] = None, + target_delimiter="\n", + delimiter="\n", + image_tokens="", + role_question="USER: ", + role_answer="ASSISTANT: ", + config=None, + ): + self.data_frame: datasets.Dataset = data_frame + self.index = index + self.target_delimiter = target_delimiter + self.doc_to_text = doc_to_text + self.doc_to_target = doc_to_target + self.doc_to_choice = doc_to_choice + self.delimiter = delimiter + if doc_to_visual: + self.vision = LazyLoadedImages(data_frame, index, doc_to_visual, image_tokens) + else: + self.vision = None + self.role_question = role_question + self.role_answer = role_answer + if doc is None: + doc = data_frame[index] + self.config = config + self.question = self._get_question(doc) + self.answer = self._get_target(doc) if include_answer else None + + def _get_question(self, doc): + text = self.doc_to_text(doc) + return text if (self.doc_to_choice is None or isinstance(text, str)) else self.doc_to_choice(doc)[text] + + def _get_target(self, doc): + return ( + str(self.doc_to_target(doc)[0]) + if type(self.doc_to_target(doc)) is list + else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) + ) + + def get_text(self): + if self.answer is None: + return self.role_question + self.question + self.delimiter + else: + return self.role_question + self.question + self.delimiter + self.role_answer + self.answer + + def __str__(self): + return self.get_text() + + def get_visions(self): + if self.vision: + return self.vision.get_images() + else: + return [] + + def already_have_image_token(self, image_token): + return image_token in self.question or (self.answer and image_token in self.answer) + + def num_images(self): + if self.vision: + return self.vision.get_num_images() + else: + return 0 + + class Context(object): def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n", description=None): self.task = task @@ -78,78 +128,78 @@ def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str 
self.target_delimiter = target_delimiter self.few_shot_delimiter = few_shot_delimiter - self.contexts = [] + self.contexts: List[QAPairs] = [] - if description: - self.add_description(description) + self.description = description def add_description(self, description): - self.contexts = [description] + self.contexts - - def get_question(self, doc): - text = self.doc_to_text(doc) - return text if (self.doc_to_choice is None or isinstance(text, str)) else self.doc_to_choice(doc)[text] - - def get_target(self, doc): - return ( - str(self.doc_to_target(doc)[0]) - if type(self.doc_to_target(doc)) is list - else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) + self.description = description + + def add_in_context_example(self, doc, data_frame, index): + # question = self.get_question(doc) + # if data_frame and index: + # visual = LazyLoadedImages(data_frame, index, self.doc_to_visual) + # else: + # visual = None + # target = self.doc_to_target(doc) + # if visual: + # self.contexts.append(visual) + self.contexts.append( + QAPairs( + data_frame, + index, + doc=doc, + doc_to_text=self.doc_to_text, + doc_to_target=self.doc_to_target, + doc_to_choice=self.doc_to_choice, + doc_to_visual=self.doc_to_visual, + delimiter=self.target_delimiter, + config=self.config, + ) ) - - def add_in_context_example(self, doc, data_frame=None, index=None): - question = self.get_question(doc) - if data_frame and index: - visual = LazyLoadedImages(data_frame, index, self.doc_to_visual) - else: - visual = None - target = self.doc_to_target(doc) - if visual: - self.contexts.append(visual) - self.contexts.append(QAPairs(question, target, self.target_delimiter)) - self.contexts.append(self.few_shot_delimiter) + # self.contexts.append(self.few_shot_delimiter) def add_question(self, doc, data_frame=None, index=None): - question = self.get_question(doc) - if data_frame and index: - visual = LazyLoadedImages(data_frame, index, self.doc_to_visual) - else: - visual = None - if visual: - self.contexts.append(visual) - self.contexts.append(QAPairs(question)) + # question = self.get_question(doc) + # if data_frame and index: + # visual = LazyLoadedImages(data_frame, index, self.doc_to_visual) + # else: + # visual = None + # if visual: + # self.contexts.append(visual) + self.contexts.append( + QAPairs( + data_frame, + index, + doc=doc, + doc_to_text=self.doc_to_text, + doc_to_target=self.doc_to_target, + doc_to_choice=self.doc_to_choice, + doc_to_visual=self.doc_to_visual, + delimiter=self.target_delimiter, + include_answer=False, + config=self.config, + ) + ) # self.contexts.append(self.target_delimiter) + + def already_have_image_token(self, image_token): + for context in self.contexts: + if context.already_have_image_token(image_token): + return True + return False - def get_text(self, *, image_tokens="", lazy=True): + def get_text(self): texts = [] - vision = [] - already_have_images = False for context in self.contexts: - if isinstance(context, str) and image_tokens in context: - already_have_images = True - break - if already_have_images: - image_tokens = "" - for context in self.contexts: - if isinstance(context, LazyLoadedImages): - if isinstance(image_tokens, str): - if lazy: - texts.append(image_tokens) - else: - now_vision = context.get_images(self.doc_to_visual) - vision.extend(now_vision) - texts.append(image_tokens * len(now_vision)) - else: - texts.append(image_tokens(context)) - else: - 
texts.append(str(context)) - if lazy: - return "".join(texts) - else: - return "".join(texts), vision + texts.append(str(context)) + return "".join(texts) def get_visions(self): - return sum([context.get_images(self.doc_to_visual) for context in self.contexts if isinstance(context, LazyLoadedImages)], start=[]) + visions = [] + for context in self.contexts: + visions.extend(context.get_visions()) + return visions def extend(self, context): if isinstance(context, list): diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index a830fd8d..80134af7 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -335,21 +335,20 @@ def _collate(x): conv = conv_templates[self.conv_template].copy() - num_image_tokens = 0 from lmms_eval.api.samplers import LazyLoadedImages, QAPairs + already_have_image_token = context.already_have_image_token(DEFAULT_IMAGE_TOKEN) + for obj in context.contexts: - if isinstance(obj, LazyLoadedImages): - num_image_tokens += obj.get_num_images() - elif isinstance(obj, QAPairs): - if num_image_tokens == 0: - question = obj.question - else: - question = " ".join(num_image_tokens * [DEFAULT_IMAGE_TOKEN]) + "\n" + obj.question - answer = obj.answer - conv.append_message(conv.roles[0], question) - conv.append_message(conv.roles[1], answer) - num_image_tokens = 0 + if already_have_image_token or obj.num_images() == 0: + question = obj.question + else: + question = " ".join(obj.num_images() * [DEFAULT_IMAGE_TOKEN]) + "\n" + obj.question + if context.description: + question = context.description + "\n" + question + answer = obj.answer + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], answer) # conv.append_message(conv.roles[0], question) # conv.append_message(conv.roles[1], None) From 89513b7b602280cf8a82cc45d23ab4316d3733c2 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Wed, 3 Apr 2024 11:26:28 +0800 Subject: [PATCH 34/38] add vila --- lmms_eval/models/gpt4v.py | 17 ++++++++++------- lmms_eval/models/llava.py | 6 ++++++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/lmms_eval/models/gpt4v.py b/lmms_eval/models/gpt4v.py index d2ec2025..68d6d20b 100644 --- a/lmms_eval/models/gpt4v.py +++ b/lmms_eval/models/gpt4v.py @@ -12,6 +12,7 @@ from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model from lmms_eval import utils +from lmms_eval.api.samplers import Context from PIL import Image @@ -65,17 +66,19 @@ def generate_until(self, requests) -> List[str]: for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: # encode, pad, and truncate contexts for this batch - visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] - visuals = self.flatten(visuals) - imgs = [] - for visual in visuals: - img = self.encode_image(visual) - imgs.append(img) + # visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + # visuals = contexts.get_visions() + # visuals = self.flatten(visuals) + # imgs = [] + # for visual in visuals: + # img = self.encode_image(visual) + # imgs.append(img) payload = {"model": "gpt-4-vision-preview", "messages": []} response_json = {"role": "user", "content": []} # When there is no image token in the context, append the image to the text - if self.image_token not in contexts: + image_token_in_context = contexts.already_have_image_token(self.image_token) + if image_token_in_context: payload["messages"].append(deepcopy(response_json)) payload["messages"][0]["content"].append({"type": "text", "text": contexts}) for img in 
imgs: diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 80134af7..3c2f9e9f 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -73,6 +73,12 @@ def __init__( self.device_map = device_map self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, get_model_name_from_path(pretrained), device_map=self.device_map, use_flash_attention_2=use_flash_attention_2) + if self._image_processor is None: + vision_tower = self._model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model() + vision_tower.to(device=device, dtype=torch.float16) + self._image_processor = vision_tower.image_processor self._config = self._model.config self.model.eval() self.model.tie_weights() From 6ae82fe49de7b7ae06a90881230e2014b443b83e Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Thu, 4 Apr 2024 10:40:40 +0800 Subject: [PATCH 35/38] Refactor LMMS API and models --- lmms_eval/api/samplers.py | 27 ++++++++++++++--- lmms_eval/models/gpt4v.py | 62 ++++++++++++++++++++++++++------------- lmms_eval/models/llava.py | 2 +- 3 files changed, 66 insertions(+), 25 deletions(-) diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index c64b2839..cecc7219 100644 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -95,7 +95,7 @@ def get_text(self): return self.role_question + self.question + self.delimiter else: return self.role_question + self.question + self.delimiter + self.role_answer + self.answer - + def __str__(self): return self.get_text() @@ -104,7 +104,7 @@ def get_visions(self): return self.vision.get_images() else: return [] - + def already_have_image_token(self, image_token): return image_token in self.question or (self.answer and image_token in self.answer) @@ -114,6 +114,25 @@ def num_images(self): else: return 0 + def get_question_list(self, image_token, image_in_front=False): + questions = [] + visions = self.get_visions() + if visions and self.already_have_image_token(image_token): + q_list = self.question.split(image_token) + for q, img in zip(q_list[:-1], visions): + if q != "": + questions.append(q) + questions.append(img) + if q_list[-1] != "": + questions.append(q_list[-1]) + else: + if image_in_front and visions: + questions.extend(visions) + questions.append(self.question) + if not image_in_front and visions: + questions.extend(visions) + return questions + class Context(object): def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str = "\n", description=None): @@ -134,7 +153,7 @@ def __init__(self, task, few_shot_delimiter: str = "\n\n", target_delimiter: str def add_description(self, description): self.description = description - + def add_in_context_example(self, doc, data_frame, index): # question = self.get_question(doc) # if data_frame and index: @@ -182,7 +201,7 @@ def add_question(self, doc, data_frame=None, index=None): ) ) # self.contexts.append(self.target_delimiter) - + def already_have_image_token(self, image_token): for context in self.contexts: if context.already_have_image_token(image_token): diff --git a/lmms_eval/models/gpt4v.py b/lmms_eval/models/gpt4v.py index 68d6d20b..9116f9f8 100644 --- a/lmms_eval/models/gpt4v.py +++ b/lmms_eval/models/gpt4v.py @@ -74,27 +74,46 @@ def generate_until(self, requests) -> List[str]: # img = self.encode_image(visual) # imgs.append(img) - payload = {"model": "gpt-4-vision-preview", "messages": []} - response_json = {"role": "user", "content": []} + # response_json = {"role": "user", 
"content": []} # When there is no image token in the context, append the image to the text - image_token_in_context = contexts.already_have_image_token(self.image_token) - if image_token_in_context: - payload["messages"].append(deepcopy(response_json)) - payload["messages"][0]["content"].append({"type": "text", "text": contexts}) - for img in imgs: - payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) - else: - contexts = contexts.split(self.image_token) - for idx, img in enumerate(imgs): - payload["messages"].append(deepcopy(response_json)) - payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]}) - payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) - - # If n image tokens are in the contexts - # contexts will be splitted into n+1 chunks - # Manually add it into the payload - payload["messages"].append(deepcopy(response_json)) - payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) + messages = [] + if contexts.description: + messages.append({"type": "system", "text": contexts.description}) + + for qa in contexts.contexts: + # content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + # payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]}) + content = [] + questions = qa.get_question_list(self.image_token) + for q in questions: + if isinstance(q, str): + content.append({"type": "text", "text": q}) + elif isinstance(q, Image.Image): + img = self.encode_image(q) + content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + messages.append({"role": "user", "content": content}) + if qa.answer: + messages.append({"role": "assistant", "content": [{"type": "text", "text": qa.answer}]}) + + payload = {"model": "gpt-4-vision-preview", "messages": messages} + + # if image_token_in_context: + # payload["messages"].append(deepcopy(response_json)) + # payload["messages"][0]["content"].append({"type": "text", "text": contexts}) + # for img in imgs: + # payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + # else: + # contexts = contexts.split(self.image_token) + # for idx, img in enumerate(imgs): + # payload["messages"].append(deepcopy(response_json)) + # payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]}) + # payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}) + + # If n image tokens are in the contexts + # contexts will be splitted into n+1 chunks + # Manually add it into the payload + # payload["messages"].append(deepcopy(response_json)) + # payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 @@ -112,6 +131,9 @@ def generate_until(self, requests) -> List[str]: try: response = url_requests.post(API_URL, headers=headers, json=payload, timeout=20) response_data = response.json() + + if "error" in response_data: + raise Exception(f"Error: {response_data['error']['message']}") content = response_data["choices"][0]["message"]["content"].strip() break # If successful, break out of the loop diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 3c2f9e9f..e8052fea 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py 
@@ -344,7 +344,7 @@ def _collate(x): from lmms_eval.api.samplers import LazyLoadedImages, QAPairs already_have_image_token = context.already_have_image_token(DEFAULT_IMAGE_TOKEN) - + for obj in context.contexts: if already_have_image_token or obj.num_images() == 0: question = obj.question From 7c03d1700ed011b1d488b16888f758ec608a3df3 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Thu, 18 Apr 2024 03:31:24 +0800 Subject: [PATCH 36/38] add docvqa fewshot --- lmms_eval/tasks/docvqa/_default_template_docvqa_yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml b/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml index 6e0cab68..a6d79436 100644 --- a/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml +++ b/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml @@ -17,3 +17,9 @@ model_specific_prompt_kwargs: qwen_vl: pre_prompt: "" post_prompt: " Answer:" +fewshot_config: + fewshot_sampler: default + fewshot_dataset: + dataset_path: lmms-lab/DocVQA + dataset_name: DocVQA + split: val From c688a72d8c16bc7f7c3da914e2a0db5d628098cb Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Thu, 18 Apr 2024 03:41:05 +0800 Subject: [PATCH 37/38] fix a small bug --- lmms_eval/tasks/docvqa/_default_template_docvqa_yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml b/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml index a6d79436..920b89d1 100644 --- a/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml +++ b/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml @@ -22,4 +22,4 @@ fewshot_config: fewshot_dataset: dataset_path: lmms-lab/DocVQA dataset_name: DocVQA - split: val + split: validation From d1ba5616fe2121f44617e4d7993905e2e0b38d8a Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Thu, 18 Apr 2024 03:49:25 +0800 Subject: [PATCH 38/38] Fix condition for checking accelerator.num_processes in Llava class --- lmms_eval/models/llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py index 531b85ea..0671dd15 100644 --- a/lmms_eval/models/llava.py +++ b/lmms_eval/models/llava.py @@ -88,7 +88,7 @@ def __init__( self.use_cache = use_cache self.truncate_context = truncate_context # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." - if accelerator.num_processes > 1 and device_map == "": + if accelerator.num_processes > 1 and device_map == "auto": assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works
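
For reference, a minimal sketch of how a model backend is expected to consume the
refactored Context / QAPairs API introduced in this series. The function name
build_messages and the "<image>" placeholder are illustrative assumptions, not part
of the patches; the real backends (gpt4v.py, llava.py) use their own tokens and
payload formats.

    from PIL import Image

    def build_messages(context, image_token="<image>"):
        """Turn a lmms_eval.api.samplers.Context into chat-style messages."""
        messages = []
        if context.description:
            messages.append({"role": "system", "content": context.description})
        for qa in context.contexts:  # every element is a QAPairs
            parts = []
            # get_question_list() interleaves question text with PIL images,
            # splitting the question around image_token when the text already
            # contains placeholders.
            for piece in qa.get_question_list(image_token):
                if isinstance(piece, str):
                    parts.append({"type": "text", "text": piece})
                elif isinstance(piece, Image.Image):
                    parts.append({"type": "image", "image": piece})
            messages.append({"role": "user", "content": parts})
            # Few-shot turns carry an answer; the final turn added via
            # Context.add_question() has answer=None and is left open.
            if qa.answer is not None:
                messages.append({"role": "assistant", "content": qa.answer})
        return messages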
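
Similarly, a hedged per-turn sketch of the image-token convention the updated llava
backend follows. IMAGE_TOKEN and render_turn are illustrative names only; llava.py
uses DEFAULT_IMAGE_TOKEN and its conversation templates.

    IMAGE_TOKEN = "<image>"

    def render_turn(context, qa):
        """Mirror the llava.py loop: prepend placeholders only when needed."""
        question = qa.question
        # If the context already embeds the token in its question text, the
        # backend must not inject additional placeholders.
        if not context.already_have_image_token(IMAGE_TOKEN) and qa.num_images() > 0:
            question = " ".join([IMAGE_TOKEN] * qa.num_images()) + "\n" + question
        if context.description:
            question = context.description + "\n" + question
        return question, qa.answer  # answer is None for the turn being generated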