[Loggings] update logging logic (EvolvingLMMs-Lab#54)
* [Fix] rearrange location of init eval_logger

* Ignore DeprecationWarnings in lmms_eval/__main__.py and lmms_eval/models/fuyu.py

* Update lmms_eval/__main__.py and lmms_eval/utils.py

* update

* Update llava.py with LLaVA model imports and error handling

* Add and test new datasets

* update

* Update wandb version and require report-editing:v0

* Add support for logging samples to Weights and Biases

This commit adds a new command-line argument, `--wandb_log_samples`, which logs all model outputs and documents to Weights and Biases for per-sample measurement and post-hoc analysis. `cli_evaluate` handles the new argument and logs the samples when it is set to True, and the `wandb_logger` object gains a `log_eval_samples` method for logging them (a sketch of this flow follows the commit message bullets below).

* update
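
A minimal, self-contained sketch of the flow described above. The stand-in class below only mimics the method names the commit message and diff mention (`post_init`, `log_eval_result`, `log_eval_samples`); the real `WandbLogger` lives inside `lmms_eval` and its exact interface is not shown in this excerpt.

```python
# Hedged sketch of the --wandb_log_samples flow; WandbLoggerStandIn only mimics
# the method names used in cli_evaluate, not the real lmms_eval implementation.
import argparse


class WandbLoggerStandIn:
    def post_init(self, results):
        # the real logger attaches run metadata / task configs from the results dict
        print("post_init with tasks:", list(results))

    def log_eval_result(self):
        # aggregate metrics per task
        print("logging aggregate eval results to W&B")

    def log_eval_samples(self, samples):
        # per-sample model outputs and documents, for post-hoc analysis
        total = sum(len(v) for v in samples.values())
        print(f"logging {total} per-sample records to W&B")


parser = argparse.ArgumentParser()
parser.add_argument("--wandb_log_samples", action="store_true", default=False,
                    help="If True, also log all model outputs and documents to Weights and Biases")
args = parser.parse_args(["--wandb_log_samples"])  # parsed from a list so the sketch runs standalone

results = {"mme": {"score": 0.0}}                     # shapes assumed for illustration
samples = {"mme": [{"doc_id": 0, "resps": ["..."]}]}  # shapes assumed for illustration

wandb_logger = WandbLoggerStandIn()
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
if args.wandb_log_samples and samples is not None:
    wandb_logger.log_eval_samples(samples)
```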
Luodian authored Feb 25, 2024
1 parent 8bb5f92 commit 4a1f43f
Showing 12 changed files with 436 additions and 244 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -113,6 +113,8 @@ accelerate launch --num_processes=8 -m lmms_eval --config example_eval.yaml # Ea
- Multi-DocVQA (multidocvqa)

## Datasets to be added and tested
- TallyQA (tallyqa)
- VSR (vsr)
- Winoground (winoground)
- NLVR2 (nlvr2)
- RavenIQ-Test (raveniq)
17 changes: 17 additions & 0 deletions demo.tape
@@ -0,0 +1,17 @@
# Where should we write the GIF?
Output demo.gif

# Set up a 1440x2560 terminal with a 24px font.
Set FontSize 24
Set Width 1440
Set Height 2560
Set WindowBar Colorful
Set LoopOffset 5 # Start the GIF at the 5th frame
Set Framerate 6
Set TypingSpeed 15ms

# Type a command in the terminal.
Type "python -m accelerate.commands.launch --main_process_port=12350 --num_processes=8 lmms_eval --model=llava --model_args=pretrained=liuhaotian/llava-v1.5-7b --tasks=mme --limit=8 --batch_size=1 --log_samples --log_samples_suffix=demo --output_path=./logs/"
Enter
# Admire the output for a bit.
Sleep 30
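
The tape file above uses the VHS (charmbracelet/vhs) scripting syntax; rendering it, presumably with `vhs demo.tape`, replays the typed `lmms_eval` command in a scripted terminal session and writes the recording to `demo.gif`.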
41 changes: 29 additions & 12 deletions lmms_eval/__main__.py
@@ -7,11 +7,15 @@
import argparse
import numpy as np

import warnings
import traceback

warnings.simplefilter("ignore", category=DeprecationWarning)

from accelerate import Accelerator
from pathlib import Path
from typing import Union
import hashlib
import wandb

from lmms_eval import evaluator, utils
from lmms_eval.tasks import initialize_tasks, include_path, get_task_dict
@@ -88,6 +92,12 @@ def parse_eval_args() -> argparse.Namespace:
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
)
parser.add_argument(
"--wandb_log_samples",
action="store_true",
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis to Weights and Biases",
)
parser.add_argument(
"--log_samples_suffix",
type=str,
@@ -145,6 +155,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
sys.exit(1)

set_loggers(args)
eval_logger = logging.getLogger("lmms-eval")
eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

args_list = []
results_list = []
@@ -173,18 +187,21 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:

for args in args_list:
try:
if is_main_process and args.wandb_args:
if is_main_process and args.wandb_args:  # init wandb only once, on the main process, rather than on every rank, to avoid extra network traffic and unwanted behavior
wandb_logger = WandbLogger(args)
results = cli_evaluate_single(args)

accelerator.wait_for_everyone()
if is_main_process and args.wandb_args and results is not None:
wandb_logger.log_eval_result(results)
if wandb_logger.online():
wandb_logger.write_to_report()
wandb_logger.finish()
results, samples = cli_evaluate_single(args)
results_list.append(results)

accelerator.wait_for_everyone()
if is_main_process:
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
if args.wandb_log_samples and samples is not None:
wandb_logger.log_eval_samples(samples)

except Exception as e:
traceback.print_exc()
eval_logger.error(f"Error during evaluation: {e}")
results_list.append(None)
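
A hedged sketch of the rank-guard pattern the loop above relies on: initialise W&B only on the main process, let every rank run the evaluation, and synchronise before logging. It assumes `accelerate` is installed; `evaluate_once()` is a hypothetical stand-in for `cli_evaluate_single(args)`.

```python
# Sketch of main-process-only logging with Hugging Face Accelerate.
from accelerate import Accelerator


def evaluate_once():
    # hypothetical stand-in for cli_evaluate_single(args); returns (results, samples)
    return {"task": {"metric": 1.0}}, {"task": [{"doc_id": 0}]}


accelerator = Accelerator()
wandb_enabled = True  # stands in for bool(args.wandb_args)

# Init the W&B run once, on rank 0 only, to avoid duplicate runs and extra network traffic.
if accelerator.is_main_process and wandb_enabled:
    print("init W&B run (e.g. WandbLogger(args))")

try:
    results, samples = evaluate_once()
    accelerator.wait_for_everyone()  # make sure all ranks finished before logging
    if accelerator.is_main_process and wandb_enabled:
        print("log aggregate results:", results)
        if samples is not None:
            print("log per-sample outputs:", samples)
except Exception as exc:
    # mirror the loop above: report the failure and record None for this run
    print(f"Error during evaluation: {exc}")
    results = None
```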

@@ -292,8 +309,8 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
filename.open("w").write(samples_dumped)
eval_logger.info(f"Saved samples to {filename}")

return results
return None
return results, samples
return None, None


def print_results(args, results):
@@ -306,7 +323,7 @@ def print_results(args, results):
def set_loggers(args):
eval_logger = logging.getLogger("lmms-eval")
ch = logging.StreamHandler()
formatter = PathFormatter("%(asctime)s [%(pathname)s:%(lineno)d] %(message)s", "%m-%d:%H:%M:%S", timezone=args.timezone)
formatter = PathFormatter("%(asctime)s [%(pathname)s:%(lineno)d] %(levelname)s %(message)s", "%m-%d %H:%M:%S", timezone=args.timezone)
ch.setFormatter(formatter)
eval_logger.addHandler(ch)
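
The `PathFormatter` used above is defined elsewhere in `lmms_eval` and is not shown in this diff; a minimal sketch of a formatter with the same constructor shape (format string, date format, `timezone=` keyword) might look like the following, assuming Python 3.9+ for `zoneinfo` and an example timezone name.

```python
# Hedged sketch of a timezone-aware log formatter; the real PathFormatter may differ.
import logging
from datetime import datetime
from zoneinfo import ZoneInfo


class TZFormatter(logging.Formatter):
    def __init__(self, fmt=None, datefmt=None, timezone="UTC"):
        super().__init__(fmt=fmt, datefmt=datefmt)
        self.tz = ZoneInfo(timezone)

    def formatTime(self, record, datefmt=None):
        # render timestamps in the configured timezone instead of local time
        dt = datetime.fromtimestamp(record.created, tz=self.tz)
        return dt.strftime(datefmt or "%m-%d %H:%M:%S")


logger = logging.getLogger("lmms-eval")
handler = logging.StreamHandler()
handler.setFormatter(TZFormatter("%(asctime)s [%(pathname)s:%(lineno)d] %(levelname)s %(message)s",
                                 "%m-%d %H:%M:%S", timezone="Asia/Singapore"))
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info("formatter attached")
```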

