
Commit 6716937

che-sh authored and facebook-github-bot committed
Support model buffers as pipeline postproc inputs (#2769)
Summary: TorchRec rewriting logic got a bit hairy over the years; this sequence of changes refactors the rewrite logic to be less convoluted and more maintainable in the future. This change enables model buffers to be used as inputs to pipelined postproc modules (the last step of the stack below).

Internal Diff stack navigation:

1. D69292525 and below - before refactoring
2. D69438143 - refactor get_node_args and friends into a class
3. D69461227 - refactor "joint lists" in ArgInfo into a list of ArgInfoStep
4. D69461226 - refactor `_build_args_kwargs` into instance methods on ArgInfo and ArgInfoStep
5. D69461228 - split monolithic `ArgInfoStep` into a class hierarchy
6. D69764721 - enable buffers as preproc arguments (**you are here**)

Differential Revision: D69764721
1 parent 0037319 commit 6716937
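For orientation, the shape the refactor converges on is a chain of small step objects: each pipelined argument is described by an ArgInfo holding a list of ArgInfoStep instances, and resolving the argument means folding the pipeline input through each step's process method in order. The following is a minimal, self-contained sketch of that design, not torchrec's actual code; the class and method names echo the diff below, everything else is illustrative.

    from typing import Any, List

    class BaseArgInfoStep:
        # Each subclass handles exactly one potential operation.
        def process(self, arg: Any) -> Any:
            raise NotImplementedError

    class NoopArgInfoStep(BaseArgInfoStep):
        def process(self, arg: Any) -> Any:
            return arg

    class GetAttrArgInfoStep(BaseArgInfoStep):
        def __init__(self, attr_name: str) -> None:
            self.attr_name = attr_name

        def process(self, arg: Any) -> Any:
            return getattr(arg, self.attr_name)

    class ArgInfo:
        def __init__(self, steps: List[BaseArgInfoStep]) -> None:
            self.steps = steps

        def process(self, arg: Any) -> Any:
            # Fold the input through each step, left to right.
            for step in self.steps:
                arg = step.process(arg)
            return arg

With this shape, `_build_args_kwargs` reduces to calling process on each ArgInfo, and supporting a new argument source (such as a module buffer) is just one more step class, which is what this commit adds.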

File tree

torchrec/distributed/test_utils/test_model.py
torchrec/distributed/train_pipeline/tests/test_train_pipelines.py
torchrec/distributed/train_pipeline/utils.py

3 files changed: +194 -2 lines changed

torchrec/distributed/test_utils/test_model.py (+120)
@@ -1989,6 +1989,126 @@ def forward(self, kjt: KeyedJaggedTensor) -> List[KeyedJaggedTensor]:
         ]
 
 
+class TestPreprocForModelWithBuffer(nn.Module):
+    """
+    Basic module for testing
+
+    Args: None
+    Examples:
+        >>> TestPreprocForModelWithBuffer()
+    Returns:
+        List[KeyedJaggedTensor]
+    """
+
+    def forward(
+        self, kjt: KeyedJaggedTensor, buffer: torch.Tensor
+    ) -> List[KeyedJaggedTensor]:
+        """
+        Selects 3 features from a specific KJT and concatenates
+        them with a KJT derived from a given buffer
+        """
+        # split
+        jt_0 = kjt["feature_0"]
+        jt_1 = kjt["feature_1"]
+        jt_2 = kjt["feature_2"]
+
+        kjt_from_buffer = KeyedJaggedTensor.from_lengths_sync(
+            ["feature_0"],
+            buffer,
+            torch.ones(buffer.size(), dtype=torch.int32, device=buffer.device),
+        )
+
+        # merge only features 0, 1, 2, removing feature 3
+        kjt_projection = KeyedJaggedTensor.from_jt_dict(
+            {
+                "feature_0": jt_0,
+                "feature_1": jt_1,
+                "feature_2": jt_2,
+            }
+        )
+
+        return [
+            KeyedJaggedTensor.concat(
+                [
+                    kjt_projection,
+                    kjt_from_buffer,
+                ]
+            )
+        ]
+
+
+class TestModelWithBuffer(nn.Module):
+    """
+    Basic module that has a postproc that takes a buffer as input
+
+    Args:
+        tables,
+        weighted_tables,
+        device,
+        buffer_size,
+        num_float_features,
+
+    Example:
+        >>> TestModelWithBuffer(tables, weighted_tables, device, 100)
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]
+    """
+
+    def __init__(
+        self,
+        tables: List[EmbeddingBagConfig],
+        weighted_tables: List[EmbeddingBagConfig],
+        device: torch.device,
+        buffer_size: int,
+        num_float_features: int = 10,
+    ) -> None:
+        super().__init__()
+        self.dense = TestDenseArch(num_float_features, device)
+
+        self.ebc: EmbeddingBagCollection = EmbeddingBagCollection(
+            tables=tables,
+            device=device,
+        )
+        self.weighted_ebc = EmbeddingBagCollection(
+            tables=weighted_tables,
+            is_weighted=True,
+            device=device,
+        )
+        max_index = tables[0].num_embeddings
+        self._postproc_module = TestPreprocForModelWithBuffer()
+        self.register_buffer(
+            "_buffer",
+            torch.randint(0, max_index, (buffer_size,), device=device),
+            persistent=False,
+        )
+
+    def forward(
+        self,
+        input: ModelInput,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Runs preproc for EBC and weighted EBC, optionally runs postproc for input
+
+        Args:
+            input
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]
+        """
+        modified_input = input
+
+        modified_input.idlist_features = self._postproc_module(
+            modified_input.idlist_features, self._buffer
+        )
+
+        ebc_out = self.ebc(modified_input.idlist_features[0])
+        weighted_ebc_out = self.weighted_ebc(modified_input.idscore_features)
+
+        pred = torch.cat([ebc_out.values(), weighted_ebc_out.values()], dim=1)
+        return pred.sum(), pred
+
+
 class TestModelWithPreproc(nn.Module):
     """
     Basic module with up to 3 postproc modules:
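The core trick in TestPreprocForModelWithBuffer is wrapping a flat tensor of ids into a KeyedJaggedTensor by pairing it with all-ones lengths, so each position contributes exactly one id to "feature_0". Below is a standalone sketch of that construction; the values are illustrative and it assumes CPU tensors.

    import torch
    from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

    buffer = torch.tensor([3, 1, 4, 1, 5])  # flat ids, e.g. a model buffer

    # One length per value: every "row" holds exactly one id.
    kjt = KeyedJaggedTensor.from_lengths_sync(
        keys=["feature_0"],
        values=buffer,
        lengths=torch.ones(buffer.size(), dtype=torch.int32),
    )
    print(kjt["feature_0"].values())  # tensor([3, 1, 4, 1, 5])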

torchrec/distributed/train_pipeline/tests/test_train_pipelines.py (+25 -1)
@@ -36,6 +36,7 @@
 from torchrec.distributed.test_utils.test_model import (
     ModelInput,
     TestEBCSharder,
+    TestModelWithBuffer,
     TestModelWithPreproc,
     TestModelWithPreprocCollectionArgs,
     TestNegSamplingModule,
@@ -1459,6 +1460,30 @@ def forward(
         self.assertEqual(len(pipeline._pipelined_modules), 2)
         self.assertEqual(len(pipeline._pipelined_postprocs), 1)
 
+    # pyre-ignore
+    @unittest.skipIf(
+        not torch.cuda.is_available(),
+        "Not enough GPUs, this test requires at least one GPU",
+    )
+    def test_postproc_with_buffer_arg(self) -> None:
+        """
+        A postproc module that takes a model buffer as an argument should still be pipelined
+        """
+        model = TestModelWithBuffer(
+            tables=self.tables[:-1],  # ignore last table as postproc will remove it
+            weighted_tables=self.weighted_tables[:-1],  # ignore last table
+            device=self.device,
+            buffer_size=self.batch_size,
+        )
+        pipelined_model, pipeline = self._check_output_equal(
+            model,
+            self.sharding_type,
+        )
+
+        # Check that both EBCs were pipelined
+        self.assertEqual(len(pipeline._pipelined_modules), 2)
+        self.assertEqual(len(pipeline._pipelined_postprocs), 1)
+
     # pyre-ignore
     @unittest.skipIf(
         not torch.cuda.is_available(),
@@ -1469,7 +1494,6 @@ def test_pipeline_postproc_with_collection_args(self) -> None:
         Exercises scenario when postproc module has an argument that is a list or dict
         with some elements being:
         * static scalars
-        * static tensors (e.g. torch.ones())
         * tensors derived from input batch (e.g. input.idlist_features["feature_0"])
         * tensors derived from input batch and other postproc module (e.g. other_postproc(input.idlist_features["feature_0"]))
         """

torchrec/distributed/train_pipeline/utils.py (+49 -1)
@@ -270,6 +270,30 @@ def process(self, arg) -> Any:
         }
 
 
+class ModuleAttributeArgInfoStep(BaseArgInfoStep):
+    def __init__(self, module: torch.nn.Module, fqn: str) -> None:
+        super().__init__()
+        self.module = module
+        self.fqn = fqn
+
+    @classmethod
+    def validate(cls, module: torch.nn.Module, fqn: str) -> None:
+        fqn_parts = fqn.split(".")
+        current = module
+        for step in fqn_parts:
+            if not hasattr(current, step):
+                raise ValueError(f"Module {module} does not have attribute {fqn}")
+            current = getattr(current, step)
+
+    # pyre-ignore
+    def process(self, _arg) -> Any:
+        fqn_parts = self.fqn.split(".")
+        current = self.module
+        for step in fqn_parts:
+            current = getattr(current, step)
+        return current
+
+
 class ArgInfoStepFactory:
     """
     Convenience class to reduce the amount of imports the external users will have.
@@ -306,6 +330,13 @@ def from_list(cls, value: List[object]) -> ListArgInfoStep:
     def from_dict(cls, value: Dict[str, object]) -> DictArgInfoStep:
         return DictArgInfoStep(value)
 
+    @classmethod
+    def from_module_attr(
+        cls, module: torch.nn.Module, fqn: str
+    ) -> ModuleAttributeArgInfoStep:
+        ModuleAttributeArgInfoStep.validate(module, fqn)
+        return ModuleAttributeArgInfoStep(module, fqn)
+
 
 @dataclass
 class ArgInfo:
@@ -1134,6 +1165,19 @@ def _handle_placeholder(
         arg_info.add_step(ArgInfoStepFactory.noop())
         return arg_info
 
+    def _handle_module_get_attr(
+        self,
+        fqn: str,
+        arg_info: ArgInfo,
+    ) -> ArgInfo:
+        # get_attr calls always carry an FQN from the model root
+        # NOTE: the first argument essentially creates a "closure" over the model,
+        # so things might get hairy if the model kept in self._model is
+        # later discarded; however, so far no training pipeline does that.
+        step = ArgInfoStepFactory.from_module_attr(self._model, fqn)
+        arg_info.add_step(step)
+        return arg_info
+
     def _handle_module(
         self, child_node: torch.fx.Node, arg_info: ArgInfo
     ) -> Optional[ArgInfo]:
@@ -1228,7 +1272,11 @@ def _get_node_args_helper_inner(
         elif child_node.op == "call_module":
             return self._handle_module(arg, arg_info)
-        elif (
-            child_node.op == "call_function"
+        elif child_node.op == "get_attr":
+            # pyre-fixme[9]: arg.target is a fqn string for get_attr op
+            module_fqn: str = arg.target
+            return self._handle_module_get_attr(module_fqn, arg_info)
+        elif (
+            child_node.op == "call_function"
             and child_node.target.__module__ == "builtins"
             # pyre-fixme[16]
             and child_node.target.__name__ == "getattr"
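ModuleAttributeArgInfoStep resolves a dotted FQN by walking getattr one segment at a time from the model root, which is what allows a registered buffer such as "_buffer" to flow into a pipelined postproc. Below is a standalone sketch of the same traversal; the resolve_fqn helper and the toy module are illustrative, not torchrec code.

    import torch
    from torch import nn

    def resolve_fqn(module: nn.Module, fqn: str) -> object:
        # Walk "a.b.c" one attribute at a time from the root module,
        # mirroring ModuleAttributeArgInfoStep.process above.
        current = module
        for step in fqn.split("."):
            current = getattr(current, step)
        return current

    class Toy(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.inner = nn.Module()
            self.inner.register_buffer("_buffer", torch.arange(4), persistent=False)

    print(resolve_fqn(Toy(), "inner._buffer"))  # tensor([0, 1, 2, 3])

Because the step closes over the live model (self._model above), it returns the current buffer value at call time rather than a snapshot, which is the caveat the NOTE in the diff points out.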
