diff --git a/open_lm/train.py b/open_lm/train.py
index af16c3de..83cd22d3 100644
--- a/open_lm/train.py
+++ b/open_lm/train.py
@@ -326,11 +326,6 @@ def train_one_epoch(
             # resetting batch / data time meters per log window
             batch_time_m.reset()
             data_time_m.reset()
-            # reset all average meters
-            losses_m.reset()
-            if averagers is not None and args.log_avg_model_training_loss:
-                for k in averagers.avgs_dict.keys():
-                    losses_avg_m[k].reset()
 
         if math.isnan(losses_m.val):
             # case where loss goes to nan, we see this sometimes with bad nodes.
@@ -338,6 +333,12 @@
             # e.g., saving checkpoints and optmization states that may lead to skipped
             # training on restarts.
             return False, step
+
+        # reset all average meters
+        losses_m.reset()
+        if averagers is not None and args.log_avg_model_training_loss:
+            for k in averagers.avgs_dict.keys():
+                losses_avg_m[k].reset()
 
     # end for
     if tb_writer is not None:
diff --git a/setup.py b/setup.py
index c2402783..cb7fcd4a 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ def _read_reqs(relpath):
 
 setuptools.setup(
     name="open_lm",
-    version="0.0.33",
+    version="0.0.34",
    author=[
         "Suchin Gururangan*",
         "Mitchell Wortsman*",
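
The train.py change is ordering-sensitive: the per-log-window meter reset used to run before the `math.isnan(losses_m.val)` guard, so the reset cleared the very value the guard inspects and a NaN loss could slip past undetected. Moving the reset after the guard (and its early `return False, step`) preserves detection. Below is a minimal sketch of the two orderings, assuming an `AverageMeter`-style class with the usual `val`/`reset()` interface; the stub is illustrative, not open_lm's implementation:

```python
import math

class AverageMeter:
    """Illustrative stand-in for the loss meter used in train_one_epoch."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

losses_m = AverageMeter()
losses_m.update(float("nan"))  # a bad node yields a NaN loss this step

# Old order: reset first, then check. reset() sets val back to 0.0,
# erasing the NaN, so the guard below would never fire.
# New order (this patch): check first, reset only if the window is clean.
if math.isnan(losses_m.val):
    print("NaN loss detected; abort early")  # train_one_epoch returns (False, step)
else:
    losses_m.reset()  # meters cleared only after the window passed the check
```

The design choice mirrors the in-code comment: returning early on a NaN prevents follow-on work such as saving checkpoints and optimization states that could lead to skipped training on restarts.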