[Memory issue] Solve memory issue for building context (#14)
* Update generation_kwargs in pope.yaml

* Update pope_doc_to_text function

* Remove unused variable in mmvet_process_results function

* Remove unused imports in utils.py

* Refactor get_chat_response function to include retries for API requests

* Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function

* Update prompt variable in lmms_eval tasks

* Refactor output_name variable in cli_evaluate function

* Fix logging message in mmvet_process_results function

* Update sleep time in get_chat_response function

* Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f'

* Refactor get_eval function to include retries

* Add token parameter to load_dataset function in gqa_doc_to_visual

* Refactor llava_process_results and llava_aggregation functions

* Remove unused function llava_aggregation

* Refactor llava-bench aggregation code

* Add logs and scripts to .gitignore, and set image_aspect_ratio to original in scienceqa.yaml

* Update generation parameters in scienceqa.yaml

* Solve memory issue for building context

* Solved gather result error

* Update lmms_eval scienceqa_img config

* Fixed nocaps store results

* Revise seedbench prompt

* Squashed commit of the following:

commit c3cc24a
Author: Zhang Peiyuan <[email protected]>
Date:   Wed Jan 24 14:07:36 2024 +0800

    add mmmu (#15)

    * add mmmu

    * black

commit 0dbc5d1
Author: Li Bo <[email protected]>
Date:   Wed Jan 24 10:00:33 2024 +0800

    [Datasets] Add four internal evaluation datasets (#13)

    * Update generation_kwargs in pope.yaml

    * Update pope_doc_to_text function

    * Remove unused variable in mmvet_process_results function

    * Remove unused imports in utils.py

    * Refactor get_chat_response function to include retries for API requests

    * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function

    * Update prompt variable in lmms_eval tasks

    * Refactor output_name variable in cli_evaluate function

    * Fix logging message in mmvet_process_results function

    * Update sleep time in get_chat_response function

    * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f'

    * Refactor get_eval function to include retries

    * Add token parameter to load_dataset function in gqa_doc_to_visual

    * Refactor llava_process_results and llava_aggregation functions

commit fec494d
Author: kcz358 <[email protected]>
Date:   Tue Jan 23 19:17:40 2024 +0800

    [Dataset] Add SEED-Bench, TextCaps, NoCaps (#12)

    * Change coco from print to logger

    * Add llava loglikelihood

    * Add Nocaps support

    * Fix pass through function

    * Add textcaps support

    * Fix textcaps eval image_id

    * Add seedbench support

    * Add seedbench ppl evaluation

    * black lint

commit 4c3c2c6
Author: Li Bo <[email protected]>
Date:   Tue Jan 23 19:17:12 2024 +0800

    [Datasets] Added POPE and Aligned. (#11)

    * Update generation_kwargs in pope.yaml

    * Update pope_doc_to_text function

---------

Co-authored-by: Bo Li <[email protected]>
kcz358 and Luodian authored Jan 24, 2024
1 parent c3cc24a commit 6d65a74
Showing 13 changed files with 187 additions and 96 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -16,4 +16,5 @@ temp
# IPython
profile_default/
ipython_config.py
logs/
logs/
scripts/
2 changes: 1 addition & 1 deletion lmms_eval/api/instance.py
@@ -5,7 +5,6 @@
@dataclass
class Instance:
request_type: Literal["loglikelihood", "loglikelihood_rolling", "generate_until"]
doc: dict
arguments: tuple
idx: int
metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here
@@ -16,6 +15,7 @@ class Instance:
task_name: str = None
doc_id: str = None
repeats: str = None
doc: dict = None

def __post_init__(self) -> None:
# unpack metadata field
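Taken together, the instance.py hunks simply move `doc` to the end of the dataclass and give it a `None` default, so an `Instance` can be created without carrying the (possibly image-heavy) document. A minimal sketch of the resulting class, reconstructed from the fragments above (fields not shown in the hunks are omitted):

from dataclasses import dataclass, field
from typing import Literal, Tuple

@dataclass
class Instance:
    request_type: Literal["loglikelihood", "loglikelihood_rolling", "generate_until"]
    arguments: tuple
    idx: int
    metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None))
    task_name: str = None
    doc_id: str = None
    repeats: str = None
    doc: dict = None  # now optional: requests reference documents by doc_id instead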
67 changes: 37 additions & 30 deletions lmms_eval/api/task.py
@@ -1,11 +1,13 @@
import abc
from dataclasses import dataclass, field, asdict

import itertools
import os
import re
import ast
import logging
import random
from tqdm import tqdm

import datasets
import numpy as np
@@ -338,38 +340,38 @@ def build_all_requests(self, limit=None, rank=None, world_size=None) -> None:
eval_logger.info(f"Building contexts for task on rank {rank}...")

instances = []
for doc_id, doc in utils.create_iterator(enumerate(docs), rank, world_size, limit):
doc_id_iterator = utils.create_iterator([i for i in range(len(docs))], rank, world_size, limit)
doc_id_iterator, doc_id_iterator_counting = itertools.tee(doc_id_iterator)
total_docs = sum(1 for _ in doc_id_iterator_counting)
pbar = tqdm(total=total_docs, desc="Building context")
for doc_id in doc_id_iterator:
# sample fewshot context #TODO: need to offset doc_id by rank now!
fewshot_ctx = self.fewshot_context(
doc,
0 if self.config.num_fewshot is None else self.config.num_fewshot,
)
fewshot_ctx = self.fewshot_context(doc_id, 0 if self.config.num_fewshot is None else self.config.num_fewshot, self.config.training_split if self.has_training_docs() else split)

# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
inst = self.construct_requests(doc=doc, ctx=fewshot_ctx, metadata=(self.config["task"], doc_id, self.config.repeats), split=split)
inst = self.construct_requests(doc_id=doc_id, ctx=fewshot_ctx, metadata=(self.config["task"], doc_id, self.config.repeats), split=split)

if not isinstance(inst, list):
inst = [inst]

instances.extend(inst)
pbar.update(1)

self._instances = instances
assert len(self._instances) != 0, "task.build_requests() did not find any docs!"

@abc.abstractmethod
def construct_requests(self, doc, ctx, **kwargs):
def construct_requests(self, doc_id, ctx, **kwargs):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LMM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param doc_id: int
The index of a document within `self.test_docs()` or `self.validation_docs()`,
whichever is the main split used.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
:param doc_idx: int
The index of a document within `self.test_docs()` or `self.validation_docs()`,
whichever is the main split used.
:param repeats: int
TODO: update this docstring
The number of times each instance in a dataset is inferred on. Defaults to 1,
@@ -421,18 +423,21 @@ def count_words(cls, doc):
@utils.positional_deprecated
def fewshot_context(
self,
doc,
doc_id,
num_fewshot,
split,
rnd=random.Random(1234),
description=None,
):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
:param doc: str
The document as returned from training_docs, validation_docs, or test_docs.
:param doc_id: int
The document id as returned from training_docs, validation_docs, or test_docs.
:param num_fewshot: int
The number of fewshot examples to provide in the returned context string.
:param split: str
The split of the document to retrieve from the dataset
:param rnd: random.Random
The pseudo-random number generator used to randomly sample examples.
WARNING: This is currently a required arg although it's optionalized with a default `None`.
@@ -444,6 +449,7 @@ def fewshot_context(
assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`"

description = description if description else ""
doc = self.dataset[split][doc_id]

if num_fewshot == 0:
labeled_examples = ""
@@ -676,18 +682,18 @@ def fewshot_docs(self):
return super().fewshot_docs()

@utils.positional_deprecated
def fewshot_context(self, doc, num_fewshot):
def fewshot_context(self, doc_id, num_fewshot, split):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
:param doc: str
The document as returned from training_docs, validation_docs, or test_docs.
:param doc_id: str
The document id as returned from training_docs, validation_docs, or test_docs.
:param num_fewshot: int
The number of fewshot examples to provide in the returned context string.
:returns: str
The fewshot context.
"""

doc = self.dataset[split][doc_id]
if num_fewshot == 0:
# always prepend the (possibly empty) task description
labeled_examples = self.config.description
@@ -840,26 +846,28 @@ def doc_to_choice(self, doc: Any) -> List[str]:
else:
raise TypeError

def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
split = kwargs.get("split")
kwargs.pop("split")
if self.OUTPUT_TYPE == "loglikelihood":
arguments = (ctx, self.doc_to_target(doc), self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split"))
arguments = (ctx, self.doc_to_target, self.doc_to_visual, doc_id, self.config.task, split)
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
arguments = (self.doc_to_target(doc),)
arguments = (self.doc_to_target,)
elif self.OUTPUT_TYPE == "multiple_choice":
doc = self.dataset[split][doc_id]
choices = self.doc_to_choice(doc)
target_delimiter = self.config.target_delimiter
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for ctx in choices]
arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, doc_id, self.config.task, split) for ctx in choices]
else:
# Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split")) for cont in choices]
kwargs.pop("split")
arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, doc_id, self.config.task, split) for cont in choices]
request_list = [
Instance(
request_type="loglikelihood",
doc=doc,
# doc=doc,
arguments=arg,
idx=i,
**kwargs,
@@ -878,7 +886,7 @@ def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
[
Instance(
request_type="loglikelihood",
doc=doc,
# doc=doc,
arguments=("", "{}".format(choice)),
idx=i,
**kwargs,
@@ -889,9 +897,8 @@ def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
return request_list

elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, self.config.generation_kwargs, self.doc_to_visual, kwargs.get("metadata")[1], self.config.task, kwargs.get("split"))
kwargs.pop("split")
return Instance(request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs)
arguments = (ctx, self.config.generation_kwargs, self.doc_to_visual, doc_id, self.config.task, split)
return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, **kwargs)

def process_results(self, doc, results):
if callable(self.config.process_results):
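The net effect of the task.py changes is that contexts are built from document ids rather than from pre-materialized documents: the id iterator is duplicated with `itertools.tee` only to count items for the progress bar, and the document itself is fetched from `self.dataset[split][doc_id]` at the moment a fewshot context or a multiple-choice request actually needs it. A standalone sketch of the pattern, assuming hypothetical `fewshot_context`/`construct_requests` callables in place of the task methods:

import itertools
from tqdm import tqdm

def build_all_requests(dataset, split, fewshot_context, construct_requests,
                       rank=0, world_size=1, limit=None):
    """Memory-friendly request building: iterate over doc ids, not documents."""
    # Yield only integer ids; the (possibly image-heavy) rows stay in the
    # on-disk dataset until fewshot_context() indexes dataset[split][doc_id].
    doc_ids = (i for i in range(len(dataset[split])) if i % world_size == rank)
    if limit is not None:
        doc_ids = itertools.islice(doc_ids, limit)

    # tee() gives a second copy of the iterator purely for counting, so the
    # progress bar total is known without building a list of documents.
    doc_ids, counting_copy = itertools.tee(doc_ids)
    pbar = tqdm(total=sum(1 for _ in counting_copy), desc="Building context")

    instances = []
    for doc_id in doc_ids:
        ctx = fewshot_context(doc_id, num_fewshot=0, split=split)
        inst = construct_requests(doc_id=doc_id, ctx=ctx, split=split)
        instances.extend(inst if isinstance(inst, list) else [inst])
        pbar.update(1)
    return instances

The counting pass only buffers small integer ids inside `tee`, never documents, which is where the memory saving comes from.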
2 changes: 1 addition & 1 deletion lmms_eval/evaluator.py
@@ -325,7 +325,7 @@ def evaluate(
"doc_id": doc_id,
"doc": {k: v for k, v in doc.items() if "image" not in k}, # do not include image
"target": target,
"arguments": [req.args[:2] for req in requests], # do not include image
"arguments": [tuple(a for a in req.args if isinstance(a, (int, str))) for req in requests], # do not include image
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[key] for req in requests],
}
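The one-line evaluator change controls what reaches the logged samples: instead of slicing the first two argument entries, it keeps only `int` and `str` entries, so callables, generation-kwarg dicts, and image data are never written to the log. A small illustration with a made-up argument tuple (the layout follows the `generate_until` arguments in task.py):

def loggable_args(args):
    # Keep only primitive entries (context string, doc_id, task name, split);
    # doc_to_visual callables, kwargs dicts, and image data are dropped.
    return tuple(a for a in args if isinstance(a, (int, str)))

example_args = ("Question: what is shown?", {"max_new_tokens": 16},
                lambda doc: doc["image"], 42, "scienceqa_img", "test")
print(loggable_args(example_args))
# -> ('Question: what is shown?', 42, 'scienceqa_img', 'test')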
3 changes: 2 additions & 1 deletion lmms_eval/models/llava.py
@@ -145,8 +145,9 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
res = []
pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

for contexts, continuation, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
# encode, pad, and truncate contexts for this batch
continuation = doc_to_target(self.task_dict[task][split][doc_id])
visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
visuals = self.flatten(visuals)
if visuals:
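With the argument tuple now carrying `doc_to_target` instead of a precomputed continuation, the llava loglikelihood loop resolves both the target text and the visuals from the lazily fetched document. A sketch of that resolution step as a standalone helper (the `task_dict[task][split][doc_id]` layout is taken from the diff; the helper itself is hypothetical):

from typing import Any, Dict, List, Tuple

def resolve_request(task_dict: Dict[str, Dict[str, Any]], args: Tuple) -> Tuple[str, str, List[Any]]:
    """Unpack one request: fetch the document by (task, split, doc_id) and
    derive the continuation and visuals from it on the fly."""
    contexts, doc_to_target, doc_to_visual, doc_id, task, split = args
    doc = task_dict[task][split][doc_id]  # images are only decoded here
    continuation = doc_to_target(doc)     # target text computed per document
    visuals = doc_to_visual(doc)          # e.g. a list of PIL images
    return contexts, continuation, visuals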
12 changes: 6 additions & 6 deletions lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml
@@ -17,17 +17,17 @@ generation_kwargs:
num_beams: 1
process_results: !function utils.llava_process_results
metric_list:
- metric: gpt_eval_llava_all
aggregation: !function utils.llava_all_aggregation
higher_is_better: true
- metric: gpt_eval_llava_conv
aggregation: !function utils.llava_aggregation
aggregation: !function utils.llava_conv_aggregation
higher_is_better: true
- metric: gpt_eval_llava_detail
aggregation: !function utils.llava_aggregation
aggregation: !function utils.llava_detail_aggregation
higher_is_better: true
- metric: gpt_eval_llava_complex
aggregation: !function utils.llava_aggregation
higher_is_better: true
- metric: gpt_eval_llava_all
aggregation: !function utils.llava_aggregation
aggregation: !function utils.llava_complex_aggregation
higher_is_better: true
metadata:
version: 0.0
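The yaml now gives every GPT-eval metric its own aggregation hook instead of sharing a single `utils.llava_aggregation`. The repository's actual implementations are not shown in this diff; as a rough, assumed shape, each per-metric function likely reduces its own list of scores along these lines:

def llava_conv_aggregation(results):
    # Assumed sketch: `results` is the list of per-sample scores collected for
    # the gpt_eval_llava_conv metric; the aggregate is their mean.
    if not results:
        return 0.0
    return sum(results) / len(results)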