Commit 35b14b0

TroyGarden authored and facebook-github-bot committed
reland D70126859 (#2787)
Summary:
Pull Request resolved: #2787

# context
* The previous diff triggered S495021.
* The error message looks like:
  ```
  ModelGenerationPlatformError("AttributeError: '_EmbeddingBagProxy' object has no attribute 'weight'")
  ```
* This happens because in some flows the EBC module is fx-traced, so there is no actual EBC but a Proxy. Without the full context it is risky to push that change.
* As a workaround, we simply convert the unsharded EBC back to float32 so it is compatible with the input KJT.weights of float32.

NOTE: this hacky change (unsharded EBC float16 ==> float32) is only needed in the tests, where we want to compare results against the sharded EBC.

WARNING: we make a strong assumption here that in any unsharded EBC (dtype=float16) use case, the input KJT.weights should never be float32.

Reviewed By: basilwong

Differential Revision: D70712348

fbshipit-source-id: f2abaa601adf3052ea322cf326363da8bfef96c3
1 parent df419e9 · commit 35b14b0
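The dtype clash described in the summary can be sketched with a plain `torch.nn.EmbeddingBag`, which is what the unsharded `EmbeddingBagCollection` wraps per table. This is only an illustration, not TorchRec code: the module `toy_bag`, the sizes, and the inputs are made up, and the mixed-dtype call is assumed to fail the way the tests did before the workaround.

```python
import torch

# One bag of a hypothetical unsharded, weighted EBC whose table is stored in
# float16 (DataType.FP16); sizes here are arbitrary.
toy_bag = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=4, mode="sum").half()

indices = torch.tensor([1, 2, 4, 5], dtype=torch.long)
offsets = torch.tensor([0, 2], dtype=torch.long)
per_sample_weights = torch.rand(4)  # float32, like the KJT.weights in the tests

try:
    # float16 table + float32 per-sample weights: assumed to be rejected,
    # which is the incompatibility the workaround targets.
    toy_bag(indices, offsets, per_sample_weights=per_sample_weights)
except RuntimeError as err:
    print(f"dtype mismatch: {err}")

# The workaround applied to the toy module: cast the unsharded table back to
# float32 so it matches the float32 per-sample weights.
with torch.no_grad():
    toy_bag.weight = torch.nn.Parameter(toy_bag.weight.float())

print(toy_bag(indices, offsets, per_sample_weights=per_sample_weights).dtype)  # torch.float32
```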

File tree: 3 files changed, +42 -22 lines

torchrec/distributed/test_utils/test_model.py (+1 -2)
@@ -243,8 +243,7 @@ def _validate_pooling_factor(
             global_idlist_indices.append(indices)
             global_idlist_offsets.append(offsets)
 
-        for idx in range(len(idscore_ind_ranges)):
-            ind_range = idscore_ind_ranges[idx]
+        for idx, ind_range in enumerate(idscore_ind_ranges):
             lengths_ = torch.abs(
                 torch.randn(batch_size * world_size, device=device)
                 + (

torchrec/distributed/test_utils/test_model_parallel.py (+4 -4)
@@ -290,7 +290,7 @@ def test_sharding_rw(
         data_type=st.sampled_from([DataType.FP32, DataType.FP16]),
         # TODO - need to enable optimizer overlapped behavior for data_parallel tables
     )
-    @settings(verbosity=Verbosity.verbose, max_examples=1, deadline=None)
+    @settings(verbosity=Verbosity.verbose, max_examples=2, deadline=None)
     def test_sharding_dp(
         self,
         sharder_type: str,
@@ -429,7 +429,7 @@ def test_sharding_cw(
         variable_batch_size=st.booleans(),
         data_type=st.sampled_from([DataType.FP32, DataType.FP16]),
     )
-    @settings(verbosity=Verbosity.verbose, max_examples=3, deadline=None)
+    @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None)
     def test_sharding_twcw(
         self,
         sharder_type: str,
@@ -510,7 +510,7 @@ def test_sharding_twcw(
         variable_batch_size=st.booleans(),
         data_type=st.sampled_from([DataType.FP32, DataType.FP16]),
     )
-    @settings(verbosity=Verbosity.verbose, max_examples=3, deadline=None)
+    @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None)
     def test_sharding_tw(
         self,
         sharder_type: str,
@@ -592,7 +592,7 @@ def test_sharding_tw(
         pooling=st.sampled_from([PoolingType.SUM, PoolingType.MEAN]),
         data_type=st.sampled_from([DataType.FP32, DataType.FP16]),
     )
-    @settings(verbosity=Verbosity.verbose, max_examples=6, deadline=None)
+    @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None)
     def test_sharding_twrw(
         self,
         sharder_type: str,

torchrec/distributed/test_utils/test_sharding.py (+37 -16)
@@ -9,18 +9,7 @@
 
 import random
 from enum import Enum
-from typing import (
-    Any,
-    Callable,
-    cast,
-    Dict,
-    List,
-    Optional,
-    Protocol,
-    Tuple,
-    Type,
-    Union,
-)
+from typing import Any, cast, Dict, List, Optional, Protocol, Tuple, Type, Union
 
 import torch
 import torch.distributed as dist
@@ -59,7 +48,12 @@
     ShardingPlan,
     ShardingType,
 )
-from torchrec.modules.embedding_configs import BaseEmbeddingConfig, EmbeddingBagConfig
+from torchrec.modules.embedding_configs import (
+    BaseEmbeddingConfig,
+    DataType,
+    EmbeddingBagConfig,
+)
+from torchrec.modules.embedding_modules import EmbeddingBagCollection
 from torchrec.optim.keyed import CombinedOptimizer, KeyedOptimizerWrapper
 from torchrec.optim.optimizers import in_backward_optimizer_filter
 
@@ -329,6 +323,15 @@ def copy_state_dict(
         tensor.copy_(global_tensor)
 
 
+# alter the EBC dtype to float32 in-place.
+def alter_global_ebc_dtype(model: nn.Module) -> None:
+    for _name, ebc in model.named_modules():
+        if isinstance(ebc, EmbeddingBagCollection) and ebc._is_weighted:
+            with torch.no_grad():
+                for bag in ebc.embedding_bags.values():
+                    bag.weight = torch.nn.Parameter(bag.weight.float())
+
+
 def sharding_single_rank_test(
     rank: int,
     world_size: int,
@@ -527,6 +530,7 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
         global_model.state_dict(),
         exclude_predfix="sparse.pooled_embedding_arch.embedding_modules._itp_iter",
     )
+    alter_global_ebc_dtype(global_model)
 
     # Run a single training step of the sharded model.
     local_pred = gen_full_pred_after_one_step(
@@ -554,9 +558,7 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
         )
 
     # Compare predictions of sharded vs unsharded models.
-    if qcomms_config is None:
-        torch.testing.assert_close(global_pred, torch.cat(all_local_pred))
-    else:
+    if qcomms_config is not None:
         # With quantized comms, we can relax constraints a bit
         rtol = 0.003
         if CommType.FP8 in [
@@ -568,6 +570,25 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
         torch.testing.assert_close(
             global_pred, torch.cat(all_local_pred), rtol=rtol, atol=atol
         )
+    elif (
+        weighted_tables is not None
+        and weighted_tables[0].data_type == DataType.FP16
+    ):
+        # We relax this accuracy test because when the embedding table weights are FP16,
+        # the sharded EBC upscales the returned embeddings to FP32:
+        #   KJT.weights (FP32) + sharded_EBC (FP16) ==> embeddings (FP32)
+        # The test uses the unsharded EBC as the reference, but the unsharded EBC
+        # uses EmbeddingBags, which can only handle matching precision, i.e.,
+        #   KJT.weights (FP32) + unsharded_EBC (FP32) ==> embeddings (FP32)
+        # This discrepancy leads to a relaxed tolerance level.
+        torch.testing.assert_close(
+            global_pred,
+            torch.cat(all_local_pred),
+            atol=1e-4,  # relaxed atol due to FP16 in weights
+            rtol=1e-4,  # relaxed rtol due to FP16 in weights
+        )
+    else:
+        torch.testing.assert_close(global_pred, torch.cat(all_local_pred))
 
 
 def create_device_mesh_for_2D(
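For reference, a small usage sketch of the new helper on a standalone weighted `EmbeddingBagCollection`. The table config below is invented for illustration, and the `torch.float16` expectation assumes the unsharded EBC honors `DataType.FP16` when allocating its `EmbeddingBag` weights, which is the situation the tests in this commit exercise.

```python
import torch
from torchrec.distributed.test_utils.test_sharding import alter_global_ebc_dtype
from torchrec.modules.embedding_configs import DataType, EmbeddingBagConfig
from torchrec.modules.embedding_modules import EmbeddingBagCollection

# A single made-up FP16 table; is_weighted=True so the helper's
# `ebc._is_weighted` check matches, as in the weighted-table tests.
ebc = EmbeddingBagCollection(
    tables=[
        EmbeddingBagConfig(
            name="t0",
            feature_names=["f0"],
            num_embeddings=100,
            embedding_dim=8,
            data_type=DataType.FP16,
        )
    ],
    is_weighted=True,
)

print({name: p.dtype for name, p in ebc.named_parameters()})
# expected: {'embedding_bags.t0.weight': torch.float16}

# In-place cast performed on the unsharded (global) model before comparing
# against the sharded model's predictions.
alter_global_ebc_dtype(ebc)

print({name: p.dtype for name, p in ebc.named_parameters()})
# expected: {'embedding_bags.t0.weight': torch.float32}
```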
