[Loggings] update logging logic (EvolvingLMMs-Lab#54)
* [Fix] rearrange location of init eval_logger

* Ignore DeprecationWarnings in lmms_eval/__main__.py and lmms_eval/models/fuyu.py

* Update lmms_eval/__main__.py and lmms_eval/utils.py

* update

* Update llava.py with LLaVA model imports and error handling

* Add and test new datasets

* update

* Update wandb version and require report-editing:v0

* Add support for logging samples to Weights and Biases

This commit adds a new command-line argument, `--wandb_log_samples`, which logs all model outputs and documents to Weights and Biases for per-sample measurement and post-hoc analysis. `cli_evaluate` handles the new argument and logs the samples when it is set to True, and the `wandb_logger` object gains a `log_eval_samples` method for logging them (a sketch of this flow follows the commit message bullets below).

* update
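
A minimal, self-contained sketch of the flow described above. The stand-in class below only mimics the method names the commit message and diff mention (`post_init`, `log_eval_result`, `log_eval_samples`); the real `WandbLogger` lives inside `lmms_eval` and its exact interface is not shown in this excerpt.

```python
# Hedged sketch of the --wandb_log_samples flow; WandbLoggerStandIn only mimics
# the method names used in cli_evaluate, not the real lmms_eval implementation.
import argparse


class WandbLoggerStandIn:
    def post_init(self, results):
        # the real logger attaches run metadata / task configs from the results dict
        print("post_init with tasks:", list(results))

    def log_eval_result(self):
        # aggregate metrics per task
        print("logging aggregate eval results to W&B")

    def log_eval_samples(self, samples):
        # per-sample model outputs and documents, for post-hoc analysis
        total = sum(len(v) for v in samples.values())
        print(f"logging {total} per-sample records to W&B")


parser = argparse.ArgumentParser()
parser.add_argument("--wandb_log_samples", action="store_true", default=False,
                    help="If True, also log all model outputs and documents to Weights and Biases")
args = parser.parse_args(["--wandb_log_samples"])  # parsed from a list so the sketch runs standalone

results = {"mme": {"score": 0.0}}                     # shapes assumed for illustration
samples = {"mme": [{"doc_id": 0, "resps": ["..."]}]}  # shapes assumed for illustration

wandb_logger = WandbLoggerStandIn()
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
if args.wandb_log_samples and samples is not None:
    wandb_logger.log_eval_samples(samples)
```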
Luodian authored Feb 25, 2024
1 parent 8bb5f92 commit 4a1f43f
Showing 12 changed files with 436 additions and 244 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -113,6 +113,8 @@ accelerate launch --num_processes=8 -m lmms_eval --config example_eval.yaml # Ea
- Multi-DocVQA (multidocvqa)

## Datasets to be added and tested
- TallyQA (tallyqa)
- VSR (vsr)
- Winoground (winoground)
- NLVR2 (nlvr2)
- RavenIQ-Test (raveniq)
17 changes: 17 additions & 0 deletions demo.tape
@@ -0,0 +1,17 @@
# Where should we write the GIF?
Output demo.gif

# Set up a 1440x2560 terminal with a 24px font.
Set FontSize 24
Set Width 1440
Set Height 2560
Set WindowBar Colorful
Set LoopOffset 5 # Start the GIF at the 5th frame
Set Framerate 6
Set TypingSpeed 15ms

# Type a command in the terminal.
Type "python -m accelerate.commands.launch --main_process_port=12350 --num_processes=8 lmms_eval --model=llava --model_args=pretrained=liuhaotian/llava-v1.5-7b --tasks=mme --limit=8 --batch_size=1 --log_samples --log_samples_suffix=demo --output_path=./logs/"
Enter
# Admire the output for a bit.
Sleep 30
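
The tape file above uses the VHS (charmbracelet/vhs) scripting syntax; rendering it, presumably with `vhs demo.tape`, replays the typed `lmms_eval` command in a scripted terminal session and writes the recording to `demo.gif`.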
41 changes: 29 additions & 12 deletions lmms_eval/__main__.py
@@ -7,11 +7,15 @@
import argparse
import numpy as np

import warnings
import traceback

warnings.simplefilter("ignore", category=DeprecationWarning)

from accelerate import Accelerator
from pathlib import Path
from typing import Union
import hashlib
import wandb

from lmms_eval import evaluator, utils
from lmms_eval.tasks import initialize_tasks, include_path, get_task_dict
@@ -88,6 +92,12 @@ def parse_eval_args() -> argparse.Namespace:
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
)
parser.add_argument(
"--wandb_log_samples",
action="store_true",
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis to Weights and Biases",
)
parser.add_argument(
"--log_samples_suffix",
type=str,
@@ -145,6 +155,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
sys.exit(1)

set_loggers(args)
eval_logger = logging.getLogger("lmms-eval")
eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

args_list = []
results_list = []
@@ -173,18 +187,21 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:

for args in args_list:
try:
if is_main_process and args.wandb_args:
if is_main_process and args.wandb_args:  # init wandb only once, on the main process, rather than on every rank, to avoid extra network traffic and unwanted behavior
wandb_logger = WandbLogger(args)
results = cli_evaluate_single(args)

accelerator.wait_for_everyone()
if is_main_process and args.wandb_args and results is not None:
wandb_logger.log_eval_result(results)
if wandb_logger.online():
wandb_logger.write_to_report()
wandb_logger.finish()
results, samples = cli_evaluate_single(args)
results_list.append(results)

accelerator.wait_for_everyone()
if is_main_process:
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
if args.wandb_log_samples and samples is not None:
wandb_logger.log_eval_samples(samples)

except Exception as e:
traceback.print_exc()
eval_logger.error(f"Error during evaluation: {e}")
results_list.append(None)
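
A hedged sketch of the rank-guard pattern the loop above relies on: initialise W&B only on the main process, let every rank run the evaluation, and synchronise before logging. It assumes `accelerate` is installed; `evaluate_once()` is a hypothetical stand-in for `cli_evaluate_single(args)`.

```python
# Sketch of main-process-only logging with Hugging Face Accelerate.
from accelerate import Accelerator


def evaluate_once():
    # hypothetical stand-in for cli_evaluate_single(args); returns (results, samples)
    return {"task": {"metric": 1.0}}, {"task": [{"doc_id": 0}]}


accelerator = Accelerator()
wandb_enabled = True  # stands in for bool(args.wandb_args)

# Init the W&B run once, on rank 0 only, to avoid duplicate runs and extra network traffic.
if accelerator.is_main_process and wandb_enabled:
    print("init W&B run (e.g. WandbLogger(args))")

try:
    results, samples = evaluate_once()
    accelerator.wait_for_everyone()  # make sure all ranks finished before logging
    if accelerator.is_main_process and wandb_enabled:
        print("log aggregate results:", results)
        if samples is not None:
            print("log per-sample outputs:", samples)
except Exception as exc:
    # mirror the loop above: report the failure and record None for this run
    print(f"Error during evaluation: {exc}")
    results = None
```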

@@ -292,8 +309,8 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
filename.open("w").write(samples_dumped)
eval_logger.info(f"Saved samples to {filename}")

return results
return None
return results, samples
return None, None


def print_results(args, results):
@@ -306,7 +323,7 @@ def print_results(args, results):
def set_loggers(args):
eval_logger = logging.getLogger("lmms-eval")
ch = logging.StreamHandler()
formatter = PathFormatter("%(asctime)s [%(pathname)s:%(lineno)d] %(message)s", "%m-%d:%H:%M:%S", timezone=args.timezone)
formatter = PathFormatter("%(asctime)s [%(pathname)s:%(lineno)d] %(levelname)s %(message)s", "%m-%d %H:%M:%S", timezone=args.timezone)
ch.setFormatter(formatter)
eval_logger.addHandler(ch)
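
The `PathFormatter` used above is defined elsewhere in `lmms_eval` and is not shown in this diff; a minimal sketch of a formatter with the same constructor shape (format string, date format, `timezone=` keyword) might look like the following, assuming Python 3.9+ for `zoneinfo` and an example timezone name.

```python
# Hedged sketch of a timezone-aware log formatter; the real PathFormatter may differ.
import logging
from datetime import datetime
from zoneinfo import ZoneInfo


class TZFormatter(logging.Formatter):
    def __init__(self, fmt=None, datefmt=None, timezone="UTC"):
        super().__init__(fmt=fmt, datefmt=datefmt)
        self.tz = ZoneInfo(timezone)

    def formatTime(self, record, datefmt=None):
        # render timestamps in the configured timezone instead of local time
        dt = datetime.fromtimestamp(record.created, tz=self.tz)
        return dt.strftime(datefmt or "%m-%d %H:%M:%S")


logger = logging.getLogger("lmms-eval")
handler = logging.StreamHandler()
handler.setFormatter(TZFormatter("%(asctime)s [%(pathname)s:%(lineno)d] %(levelname)s %(message)s",
                                 "%m-%d %H:%M:%S", timezone="Asia/Singapore"))
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info("formatter attached")
```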

