From b864d15957811dae0886f7c6f444e882590f7d39 Mon Sep 17 00:00:00 2001 From: GeorgiosSmyrnis Date: Tue, 14 May 2024 10:21:22 -0500 Subject: [PATCH] Improve error message. (#275) * Improve error message. * Formatting. --- open_lm/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/open_lm/main.py b/open_lm/main.py index f31c1224..abe56ba8 100644 --- a/open_lm/main.py +++ b/open_lm/main.py @@ -843,10 +843,14 @@ def main(args): expected_steps = data["train"].dataloader.num_batches if steps_done_epoch < (1 - args.data_tolerate_error_p) * expected_steps and not done_training: num_ckpt_too_few_tokens += 1 + if is_master(args): + logging.warning( + f"Epoch {epoch}, tokens seen: {steps_done_epoch * args.global_batch_size * args.seq_len}, tokens expected: {expected_steps * args.global_batch_size * args.seq_len}, ratio: {steps_done_epoch / expected_steps}" + ) if num_ckpt_too_few_tokens > args.data_tolerate_num_ckpts: raise RuntimeError( - f"{num_ckpt_too_few_tokens} checkpoints happened where the number of tokens seen was less than {1 - args.data_tolerate_error_p} of expected. This is likely due to transient errors e.g. reading from S3." + f"{num_ckpt_too_few_tokens} checkpoints happened where the number of tokens seen was {1 - args.data_tolerate_error_p} of expected. This is likely due to transient errors e.g. reading from S3." ) epoch = epoch + 1