Commit

Investigating
michaelbenayoun committed Nov 19, 2024
1 parent 02c331d commit 23d2d5c
Showing 1 changed file with 3 additions and 1 deletion.
optimum/neuron/trainers.py (3 additions, 1 deletion)
@@ -484,6 +484,7 @@ def _reduce_loss(self, tr_loss: torch.Tensor) -> torch.Tensor:
             # It works even for PP because under PP we make it so that the main process to log for callbacks is
             # the one on dp_rank = tp_rank = 0 and pp_rank = pp_size -1.
             reduced_tr_loss = xm.all_reduce(xm.REDUCE_SUM, tr_loss_div, groups=get_data_parallel_group(as_list=True))
+            print(reduced_tr_loss, xm.get_ordinal())
         else:
             reduced_tr_loss = xm.all_reduce(xm.REDUCE_SUM, tr_loss_div)

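The added print in the first hunk logs the all-reduced loss alongside xm.get_ordinal(), presumably to check that every replica in the data-parallel group sees the same summed value. For reference, here is a minimal sketch of the same reduction pattern written against plain torch.distributed rather than torch_xla; the helper name reduce_loss_across_dp and its dp_group argument are illustrative and not part of the repository:

import torch
import torch.distributed as dist

def reduce_loss_across_dp(loss: torch.Tensor, dp_group=None) -> torch.Tensor:
    # Sum a per-replica loss across a data-parallel process group.
    # Assumes torch.distributed.init_process_group() has already been called;
    # with dp_group=None the reduction runs over the default (world) group.
    reduced = loss.clone()  # keep the local accumulator untouched
    dist.all_reduce(reduced, op=dist.ReduceOp.SUM, group=dp_group)
    return reduced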
@@ -498,7 +499,8 @@ def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, igno
         if self.control.should_log:
             with torch.no_grad():
                 if isinstance(getattr(self, "_zero_loss_value"), torch.Tensor):
-                    tr_loss.data = self._zero_loss_value.data
+                    # tr_loss.data = self._zero_loss_value.data
+                    tr_loss.zero_()
                 else:
                     tr_loss.zero_()

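The second hunk comments out the rebinding of tr_loss.data and zeroes the accumulator in place instead. A minimal sketch of the behavioral difference on plain CPU tensors (outside of XLA, with illustrative variable names):

import torch

# Rebinding .data makes tr_loss point at the other tensor's storage, so the
# two tensors share memory afterwards.
tr_loss = torch.tensor(3.5)
zero_value = torch.tensor(0.0)
tr_loss.data = zero_value.data

# zero_() instead overwrites the accumulator in place, keeping its own storage.
tr_loss = torch.tensor(3.5)
tr_loss.zero_()
print(tr_loss)  # tensor(0.)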
