From 23d2d5ce3affbacbe7bf143c82dd2b76e42590af Mon Sep 17 00:00:00 2001
From: Michael Benayoun
Date: Tue, 19 Nov 2024 15:37:49 +0100
Subject: [PATCH] Investigating

---
 optimum/neuron/trainers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py
index 25edb1102..59b74a1cd 100755
--- a/optimum/neuron/trainers.py
+++ b/optimum/neuron/trainers.py
@@ -484,6 +484,7 @@ def _reduce_loss(self, tr_loss: torch.Tensor) -> torch.Tensor:
             # It works even for PP because under PP we make it so that the main process to log for callbacks is
             # the one on dp_rank = tp_rank = 0 and pp_rank = pp_size -1.
             reduced_tr_loss = xm.all_reduce(xm.REDUCE_SUM, tr_loss_div, groups=get_data_parallel_group(as_list=True))
+            print(reduced_tr_loss, xm.get_ordinal())
         else:
             reduced_tr_loss = xm.all_reduce(xm.REDUCE_SUM, tr_loss_div)
@@ -498,7 +499,8 @@ def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, igno
         if self.control.should_log:
             with torch.no_grad():
                 if isinstance(getattr(self, "_zero_loss_value"), torch.Tensor):
-                    tr_loss.data = self._zero_loss_value.data
+                    # tr_loss.data = self._zero_loss_value.data
+                    tr_loss.zero_()
                 else:
                     tr_loss.zero_()
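
Note on the second hunk: a minimal plain-PyTorch sketch (CPU only, no
torch_xla; the variable names here are illustrative stand-ins for the
trainer's tr_loss and _zero_loss_value) of the aliasing hazard that
"tr_loss.data = self._zero_loss_value.data" can introduce. Rebinding
.data makes the loss accumulator share storage with the cached zero
constant, so the next in-place accumulation silently corrupts that
constant; tr_loss.zero_() instead resets the accumulator in its own
storage.

    import torch

    # Accumulator and a cached "zero" constant, as in the trainer.
    tr_loss = torch.tensor(5.0)
    zero_loss_value = torch.tensor(0.0)

    # Resetting by rebinding .data: tr_loss now views the constant's storage.
    tr_loss.data = zero_loss_value.data

    # The next in-place accumulation (+= is in-place for tensors) mutates
    # the shared storage, so the "zero" constant is no longer zero.
    tr_loss += torch.tensor(2.0)
    print(zero_loss_value)  # tensor(2.) -- constant corrupted

    # Resetting in place keeps the accumulator's own storage intact.
    tr_loss2 = torch.tensor(5.0)
    zero2 = torch.tensor(0.0)
    tr_loss2.zero_()
    tr_loss2 += torch.tensor(2.0)
    print(tr_loss2, zero2)  # tensor(2.) tensor(0.)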