Joshua/vizwizvqa refactor (EvolvingLMMs-Lab#42)
* refactor vizwizvqa task

* Merge commit '59c7d67077c315657a02bdee2eace0e64c1ee0d4'

* Fix exact_match accuracy calculation in vizwiz_vqa_process_results

* Update vizwiz_vqa tasks

---------

Co-authored-by: Fanyi Pu <[email protected]>
JvThunder and pufanyi authored Feb 6, 2024
1 parent 3b0e5e9 commit 058a7d4
Showing 15 changed files with 174 additions and 342 deletions.
2 changes: 1 addition & 1 deletion example_eval.yaml
@@ -1,6 +1,6 @@
- model: llava
  model_args: pretrained=liuhaotian/llava-v1.5-7b
  tasks: docvqa_val,docvqa_test,infovqa_val,infovqa_test
  tasks: vizwiz_vqa
  batch_size: 1
  log_samples: true
  log_samples_suffix: debug
3 changes: 2 additions & 1 deletion lmms_eval/__main__.py
@@ -179,7 +179,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        accelerator.wait_for_everyone()
        if is_main_process:
            wandb_logger.log_eval_result(results)
            wandb_logger.write_to_report(results)
            if wandb_logger.online():
                wandb_logger.write_to_report(results)
            wandb_logger.finish()
        results_list.append(results)

5 changes: 4 additions & 1 deletion lmms_eval/logging_utils.py
@@ -261,7 +261,7 @@ def write_to_report(self, results):
        report = self.wr.Report(
            project=self.run.project,
            entity=self.run.entity,
            title=f"({datetime.now().strftime('%Y-%m-%d %H:%M:%S')}) xxx - Evaluation report",
            title=f"({datetime.now().strftime('%Y-%m-%d %H:%M:%S')}) {self.run.id} - Evaluation report",
            description=f"Evaluation run by: {self.run.entity} logged to {self.run.url}",
)

@@ -301,3 +301,6 @@ def write_to_report(self, results):

    def finish(self):
        self.run.finish()

    def online(self):
        return self.run.offline is False
10 changes: 10 additions & 0 deletions lmms_eval/models/__init__.py
@@ -3,3 +3,13 @@
from .qwen_vl import Qwen_VL
from .fuyu import Fuyu
from .gpt4v import GPT4V

import os

try:
    # enabling faster model download
    import hf_transfer

    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
except ImportError:
    pass
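For background (not part of this commit): huggingface_hub only switches to the Rust-based hf_transfer downloader when that package is installed and HF_HUB_ENABLE_HF_TRANSFER is set, so the guarded import above is a safe opt-in that silently falls back to the default downloader otherwise. A hypothetical way to confirm the toggle at runtime:

import os

import lmms_eval.models  # importing the package sets the flag when hf_transfer is available

# Hypothetical check, not repository code: "1" means accelerated downloads are enabled.
print(os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "not set"))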
79 changes: 72 additions & 7 deletions lmms_eval/models/otterhd.py
@@ -1,3 +1,4 @@
from accelerate import Accelerator, DistributedType
from transformers import FuyuForCausalLM, AutoTokenizer, FuyuImageProcessor, FuyuProcessor
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model
@@ -8,6 +9,13 @@
from lmms_eval.api.instance import Instance
from tqdm import tqdm

import warnings
import logging

warnings.filterwarnings("ignore")

eval_logger = logging.getLogger("lmms-eval")


@register_model("otterhd")
class OtterHD(lmms):
@@ -28,29 +36,86 @@ def __init__(
        # Do not use kwargs for now
        assert kwargs == {}, f"Unexpected kwargs: {kwargs}"

        self.device = device if torch.cuda.is_available() else "cpu"
        self.model = FuyuForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device)
        accelerator = Accelerator()
        if accelerator.num_processes > 1:
            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
        else:
            self._device = device

        self._model = FuyuForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self._device)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained)
        self.model.tie_weights()
        self._tokenizer = AutoTokenizer.from_pretrained(pretrained)
        self._config = self.model.config

        height, width = map(int, resolution.split("x"))
        self.image_processor = FuyuImageProcessor(size={"height": height, "width": width})
        self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer)
        self.max_new_tokens = max_new_tokens
        self.batch_size_per_gpu = int(batch_size)

        if accelerator.num_processes > 1:
            assert accelerator.distributed_type in [
                DistributedType.FSDP,
                DistributedType.MULTI_GPU,
            ], "Unsupported distributed type provided. Only DDP and FSDP are supported."
            if accelerator.distributed_type == DistributedType.FSDP:
                self._model = accelerator.prepare(self.model)
            else:
                self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
            self.accelerator = accelerator
            if self.accelerator.is_local_main_process:
                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
            self._rank = self.accelerator.local_process_index
            self._world_size = self.accelerator.num_processes
        else:
            self.model.to(self._device)
            self._rank = 0
            self._world_size = 1

    @property
    def config(self):
        # return the associated transformers.AutoConfig for the given pretrained model.
        return self._config

    @property
    def tokenizer(self):
        return self._tokenizer

    @property
    def model(self):
        # returns the model, unwrapping it if using Accelerate
        if hasattr(self, "accelerator"):
            return self.accelerator.unwrap_model(self._model)
        else:
            return self._model

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        # Assuming max_length is the sum of max context tokens and max new tokens
        return self.tokenizer.model_max_length

    # @property
    # def max_gen_toks(self) -> int:
    #     return self.max_new_tokens

    @property
    def batch_size(self):
        return self.batch_size_per_gpu

    @property
    def device(self):
        return self._device

    @property
    def rank(self):
        return self._rank

    @property
    def world_size(self):
        return self._world_size

    def flatten(self, input, only_get_first=False):
        new_list = []
        for i in input:
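As an aside, a minimal sketch (an assumption, not code from this commit) of how rank and world_size are typically consumed by an evaluation harness to split work across the data-parallel processes set up above:

# Hypothetical illustration: each process evaluates a strided slice of the requests,
# so world_size processes cover the workload without duplicating any item.
def shard_requests(requests, rank, world_size):
    return requests[rank::world_size]

# With world_size=4, the process with rank 1 handles requests 1, 5, 9, ...

Each process then gathers or writes back only its own results, which is why the properties above must report a consistent rank/world_size pair.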
2 changes: 2 additions & 0 deletions lmms_eval/tasks/seedbench/utils.py
@@ -43,10 +43,12 @@ def seed_aggregation_result_all(results):

    return score


def seed_doc_to_text_mc(doc):
    question = doc["question"]
    return f"{question} Answer :"


def seed_doc_to_choice(doc):
    return [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]]

10 changes: 10 additions & 0 deletions lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml
@@ -0,0 +1,10 @@
dataset_path: lmms-lab/VizWiz-VQA
output_type: generate_until
doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual
doc_to_text: !function utils.vizwiz_vqa_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  until:
    - "ASSISTANT:"
metadata:
  - version: 0.0
25 changes: 25 additions & 0 deletions lmms_eval/tasks/vizwiz_vqa/_generate_config.py
@@ -0,0 +1,25 @@
import os
import yaml

splits = ["val", "test"]
tasks = ["vqa"]

if __name__ == "__main__":
    dump_tasks = []
    for task in tasks:
        for split in splits:
            yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
            if split == "train":
                yaml_dict.pop("group")
            else:
                dump_tasks.append(f"vizwiz_{task}_{split}")

            save_path = f"./vizwiz_{task}_{split}.yaml"
            print(f"Saving to {save_path}")
            with open(save_path, "w") as f:
                yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)

    group_dict = {"group": "vizwiz_vqa", "task": dump_tasks}

    with open("./_vizwiz_vqa.yaml", "w") as f:
        yaml.dump(group_dict, f, default_flow_style=False, indent=4)
4 changes: 4 additions & 0 deletions lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml
@@ -0,0 +1,4 @@
group: vizwiz_vqa
task:
  - vizwiz_vqa_val
  - vizwiz_vqa_test
lmms_eval/tasks/vizwiz_vqa/utils.py
@@ -9,13 +9,6 @@

eval_logger = logging.getLogger("lmms-eval")

with open(pathlib.Path(__file__).parent / "vizwizvqa_test.yaml", "r") as f:
    raw_data = f.readlines()
    for i in range(len(raw_data)):
        raw_data[i] = raw_data[i].replace("!function", "function")

    config = yaml.safe_load("".join(raw_data))


class EvalAIAnswerProcessor:
    CONTRACTIONS = {
@@ -223,11 +216,11 @@ def __call__(self, item):
        return item


def vizwizvqa_doc_to_visual(doc):
def vizwiz_vqa_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def vizwizvqa_process_results(doc, result):
def vizwiz_vqa_process_results(doc, result):
    eval_ai_processor = EvalAIAnswerProcessor()
    assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
    resAns = eval_ai_processor(result[0])
@@ -250,23 +243,33 @@ def vizwizvqa_process_results(doc, result):
        accuracy = 0

    return {
        # "exact_match": accuracy,
        "exact_match": accuracy,
        "submission": {
            "image": f"{doc['question_id']}.jpg",
            "answer": resAns,
        },
    }


def vizwizvqa_doc_to_text(doc):
def vizwiz_vqa_process_results_test(doc, result):
    res = vizwiz_vqa_process_results(doc, result)
    return {"submission": res["submission"]}


def vizwiz_vqa_process_results_val(doc, result):
    res = vizwiz_vqa_process_results(doc, result)
    return {"exact_match": res["exact_match"]}


def vizwiz_vqa_doc_to_text(doc):
    text = f"{doc['question'].capitalize()}\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
    return text


def vizwizvqa_aggreate_submissions(results):
def vizwiz_vqa_aggreate_submissions(results):
    now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    os.makedirs("./submissions", exist_ok=True)
    submission_file_name = f"./submissions/vizwizvqa-test-submission-{now_date_time}.json"
    submission_file_name = f"./submissions/vizwiz_vqa-test-submission-{now_date_time}.json"
    path = os.path.abspath(submission_file_name)
    with open(path, "w") as f:
        json.dump(results, f)
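For readers tracking the "Fix exact_match accuracy calculation" bullet from the commit message: the hunk above only shows the tail of the computation, so here is a minimal sketch (an assumption based on the standard VQA/VizWiz protocol, not the repository's exact code) of how a per-question accuracy is usually derived from the crowd answers before being returned as exact_match:

def vqa_accuracy(pred, gt_answers):
    # Standard VQA-style scoring: an answer given by at least 3 annotators scores 1.0;
    # otherwise it receives partial credit of matches / 3.
    matching = sum(1 for ans in gt_answers if ans == pred)
    return min(1.0, matching / 3.0)

# Example: vqa_accuracy("unanswerable", ["unanswerable"] * 4 + ["yes"] * 6) -> 1.0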
14 changes: 14 additions & 0 deletions lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml
@@ -0,0 +1,14 @@
group: vizwiz_vqa
task: vizwiz_vqa_test
test_split: test
include: _default_template_vqa_yaml
process_results: !function utils.vizwiz_vqa_process_results_test
metric_list:
  # - metric: exact_match
  #   aggregation: mean
  #   higher_is_better: true
  #   ignore_case: true
  #   ignore_punctuation: true
  - metric: submission
    aggregation: !function utils.vizwiz_vqa_aggreate_submissions
    higher_is_better: true
14 changes: 14 additions & 0 deletions lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml
@@ -0,0 +1,14 @@
group: vizwiz_vqa
task: vizwiz_vqa_val
test_split: val
include: _default_template_vqa_yaml
process_results: !function utils.vizwiz_vqa_process_results_val
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
  # - metric: submission
  #   aggregation: !function utils.vizwiz_vqa_aggreate_submissions
  #   higher_is_better: true
20 changes: 0 additions & 20 deletions lmms_eval/tasks/vizwizvqa_test/vizwizvqa_test.yaml

This file was deleted.
