Fix order of loss resets.
GeorgiosSmyrnis committed May 8, 2024
1 parent 87955f5 commit dbc511d
Showing 2 changed files with 7 additions and 6 deletions.
open_lm/train.py (6 additions, 5 deletions)
@@ -326,18 +326,19 @@ def train_one_epoch(
             # resetting batch / data time meters per log window
             batch_time_m.reset()
             data_time_m.reset()
-            # reset all average meters
-            losses_m.reset()
-            if averagers is not None and args.log_avg_model_training_loss:
-                for k in averagers.avgs_dict.keys():
-                    losses_avg_m[k].reset()
 
             if math.isnan(losses_m.val):
                 # case where loss goes to NaN; we see this sometimes with bad nodes.
                 # in this case, we would like to free resources and prevent other issues,
                 # e.g., saving checkpoints and optimization states that may lead to skipped
                 # training on restarts.
                 return False, step
 
+            # reset all average meters
+            losses_m.reset()
+            if averagers is not None and args.log_avg_model_training_loss:
+                for k in averagers.avgs_dict.keys():
+                    losses_avg_m[k].reset()
+
     # end for
     if tb_writer is not None:
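Why the order matters: `losses_m.val` holds the most recently recorded loss, so resetting the meter before the `math.isnan` check would zero out a NaN and let training continue past a bad node. Below is a minimal sketch of the failure mode, assuming a torchvision-style `AverageMeter` whose `reset()` zeroes `val`; this simplified class is a stand-in for illustration, not open_lm's actual meter.

```python
import math


class AverageMeter:
    """Simplified torchvision-style meter: tracks the last value and a running sum."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n


losses_m = AverageMeter()
losses_m.update(float("nan"))  # a bad node produced a NaN loss

# Old order: resetting first masks the NaN, so the early return never fires.
losses_m.reset()
print(math.isnan(losses_m.val))  # False -- the NaN went undetected

# Fixed order: check first, then reset.
losses_m.update(float("nan"))
if math.isnan(losses_m.val):
    print("NaN detected; abort before saving checkpoints")  # this branch runs
losses_m.reset()
```

With the fixed order, a NaN loss triggers the early `return False, step` before any meter is cleared, instead of being silently replaced by zero at the next log window.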
setup.py (1 addition, 1 deletion)
@@ -19,7 +19,7 @@ def _read_reqs(relpath):
 
 setuptools.setup(
     name="open_lm",
-    version="0.0.33",
+    version="0.0.34",
     author=[
         "Suchin Gururangan*",
         "Mitchell Wortsman*",