Xilinx · Giuseppe5 · Nov 21, 2024 · Nov 21, 2024 · Nov 25, 2024 · Nov 25, 2024
diff --git a/src/brevitas/core/scaling/standalone.py b/src/brevitas/core/scaling/standalone.py
@@ -372,6 +372,7 @@ def __init__(
         self.restrict_inplace_preprocess = restrict_scaling_impl.restrict_init_inplace_module()
         self.restrict_scaling_pre = restrict_scaling_impl.restrict_init_module()
         self.restrict_threshold_pre = restrict_threshold_impl.restrict_init_module()
+        self.init_done: bool = brevitas.jit.Attribute(False, bool)
 
     @brevitas.jit.script_method
     def training_forward(self, stats_input: Tensor, threshold: Tensor) -> Tensor:
@@ -411,22 +412,22 @@ def training_forward(self, stats_input: Tensor, threshold: Tensor) -> Tensor:
     def forward(self, stats_input: Tensor, threshold: Optional[Tensor] = None) -> Tensor:
         if threshold is None:
             threshold = torch.ones(1).type_as(stats_input)
-        if self.training:
+        if self.training and not self.init_done:
             # Threshold division handled inside the training_forward
             return self.training_forward(stats_input, threshold)
         else:
-            if self.counter <= self.collect_stats_steps:
-                out = self.buffer
+            if not self.init_done:
+                self.init_done = True
                 # No clamping is necessary since statistics are already clamped in training_forward
-                out = self.restrict_scaling_pre(out)
-            else:
-                out = self.value
+                self.restrict_inplace_preprocess(self.buffer)
+                inplace_tensor_mul(self.value.detach(), self.buffer)
+            out = self.value
             threshold = self.restrict_threshold(self.restrict_threshold_pre(threshold))
             out = self.restrict_scaling(out)
             out = out / threshold
             # We can clamp after restrict val since the learned parameter is already in log-domain
             out = abs_binary_sign_grad(self.clamp_scaling(out))
-        return out
+            return out
 
     def state_dict(self, destination=None, prefix='', keep_vars=False):
         output_dict = super(ParameterFromRuntimeStatsScaling, self).state_dict(

diff --git a/tests/brevitas/export/test_torch_qcdq.py b/tests/brevitas/export/test_torch_qcdq.py
@@ -13,6 +13,7 @@
 
 @requires_pt_ge('1.9.1')
 @jit_disabled_for_export()
+@torch.no_grad()
 def test_torch_qcdq_wbiol_export(
         quant_module,
         quant_module_impl,
@@ -57,6 +58,7 @@ def test_torch_qcdq_wbiol_export(
 @requires_pt_ge('1.9.1')
 @jit_disabled_for_export()
 @parametrize('input_signed', [True, False])
+@torch.no_grad()
 def test_torch_qcdq_avgpool_export(input_signed, output_bit_width):
     in_size = (1, IN_CH, FEATURES, FEATURES)
     inp = torch.randn(in_size)