
Commit cb6b69a

gnahzg authored and facebook-github-bot committed
Support Hybrid Sharding for DI (#1786)
Summary:
Pull Request resolved: #1786

Minimum change to support hybrid sharding for DI.

Context: https://docs.google.com/document/d/1Y0H1TntfZkW5Cgw0_B_gydC9qPIQtpirchVtzSADid8/edit#heading=h.z2j5qijdvagp

TLDR: DI needs to shard tables from the same EC such that some tables go to CPU and some go to GPU. Currently we only support treating all hosts/devices as a single env. The changes below enable sharding by device group. Most of the implementation is copied from D54570308, with adjustments to shard according to device instead of sharding type.

TODO: generalize support for hybrid sharding by
(1) explicitly supporting sharding plans with a different world_size per device group,
(2) cleaning up the code, and
(3) supporting generation of sharding plans by device group and merging the resulting plans.

Reviewed By: IvanKobzarev

Differential Revision: D54805360

fbshipit-source-id: d7b8c7e0232d6de457fc991f55c4c6bd3d53812c
1 parent 0ec0b2e commit cb6b69a

4 files changed: +152 −6
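For orientation before the diffs: the core idea is that `env` may now be a mapping from device type to ShardingEnv, and each group of tables is sharded against the env for the device its shards are placed on. Below is a minimal sketch of the intended call pattern, not part of the commit; `quantized_model`, `sharder`, and `plan` are placeholders, set up the way the new test at the bottom of this diff does.

import torch

from torchrec.distributed.shard import _shard_modules
from torchrec.distributed.types import ShardingEnv

# One ShardingEnv per device group, keyed by the torch.device type string
# ("cpu"/"cuda") that the table placements resolve to.
env_dict = {
    "cpu": ShardingEnv.from_local(3, 0),   # world_size=3, rank=0 for the CPU group
    "cuda": ShardingEnv.from_local(2, 0),  # world_size=2, rank=0 for the GPU group
}

# When env is a dict, a manually constructed ShardingPlan is required
# (the planner path asserts a single ShardingEnv; see shard.py below).
sharded_model = _shard_modules(
    module=quantized_model,  # placeholder: quantized EC model as in the test below
    sharders=[sharder],      # placeholder: QuantEmbeddingCollectionSharder()
    device=torch.device("cpu"),
    plan=plan,               # placeholder: manual plan placing tables on cpu/cuda
    env=env_dict,
)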

torchrec/distributed/quant_embedding.py (+35 −4)
@@ -10,7 +10,7 @@

 from collections import defaultdict, deque
 from dataclasses import dataclass
-from typing import Any, cast, Dict, List, Optional, Tuple, Type
+from typing import Any, cast, Dict, List, Optional, Tuple, Type, Union

 import torch
 from fbgemm_gpu.split_table_batched_embeddings_ops_inference import (
@@ -80,6 +80,24 @@ def record_stream(self, stream: torch.cuda.streams.Stream) -> None:
         ctx.record_stream(stream)


+def get_device_from_parameter_sharding(ps: ParameterSharding) -> str:
+    # pyre-ignore
+    return ps.sharding_spec.shards[0].placement.device().type
+
+
+def get_device_from_sharding_type(
+    emb_shard_infos: List[EmbeddingShardingInfo],
+) -> str:
+    res = list(
+        {
+            get_device_from_parameter_sharding(ps.param_sharding)
+            for ps in emb_shard_infos
+        }
+    )
+    assert len(res) == 1, "All shards should be on the same type of device"
+    return res[0]
+
+
 def create_infer_embedding_sharding(
     sharding_type: str,
     sharding_infos: List[EmbeddingShardingInfo],
@@ -336,19 +354,25 @@ def __init__(
         self,
         module: QuantEmbeddingCollection,
         table_name_to_parameter_sharding: Dict[str, ParameterSharding],
-        env: ShardingEnv,
+        # TODO: Consolidate to use Dict[str, ShardingEnv]
+        env: Union[
+            ShardingEnv, Dict[str, ShardingEnv]
+        ],  # Support hybrid sharding for DI
         fused_params: Optional[Dict[str, Any]] = None,
         device: Optional[torch.device] = None,
     ) -> None:
         super().__init__()

         self._embedding_configs: List[EmbeddingConfig] = module.embedding_configs()

+        self._is_hybrid_sharding: bool = isinstance(env, Dict)
+
         self._sharding_type_to_sharding_infos: Dict[
             str, List[EmbeddingShardingInfo]
         ] = create_sharding_infos_by_sharding(
             module, table_name_to_parameter_sharding, fused_params
         )
+
         self._sharding_type_to_sharding: Dict[
             str,
             EmbeddingSharding[
@@ -359,7 +383,14 @@ def __init__(
             ],
         ] = {
             sharding_type: create_infer_embedding_sharding(
-                sharding_type, embedding_confings, env
+                sharding_type,
+                embedding_confings,
+                (
+                    env
+                    if not self._is_hybrid_sharding
+                    # pyre-ignore
+                    else env[get_device_from_sharding_type(embedding_confings)]
+                ),
             )
             for sharding_type, embedding_confings in self._sharding_type_to_sharding_infos.items()
         }
@@ -732,7 +763,7 @@ def shard(
         self,
         module: QuantEmbeddingCollection,
         params: Dict[str, ParameterSharding],
-        env: ShardingEnv,
+        env: Union[ShardingEnv, Dict[str, ShardingEnv]],
         device: Optional[torch.device] = None,
     ) -> ShardedQuantEmbeddingCollection:
         fused_params = self.fused_params if self.fused_params else {}
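A side note on how the device-group key is derived (illustration only, not part of the commit): a shard placement such as "rank:0/cuda:0" resolves, via torch's private _remote_device helper, to a torch.device whose .type is "cuda", and that string is what indexes the env dict in the hunk above.

import torch
from torch.distributed._remote_device import _remote_device

# ParameterSharding.sharding_spec.shards[0].placement is a _remote_device;
# .device().type yields the device-group key ("cpu" or "cuda").
placement = _remote_device("rank:0/cuda:0")
assert placement.device().type == "cuda"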

torchrec/distributed/shard.py (+7 −1)
@@ -189,7 +189,10 @@ def init_weights(m):

 def _shard_modules(  # noqa: C901
     module: nn.Module,
-    env: Optional[ShardingEnv] = None,
+    # TODO: Consolidate to using Dict[str, ShardingEnv]
+    env: Optional[
+        Union[ShardingEnv, Dict[str, ShardingEnv]]
+    ] = None,  # Support hybrid sharding
     device: Optional[torch.device] = None,
     plan: Optional[ShardingPlan] = None,
     sharders: Optional[List[ModuleSharder[torch.nn.Module]]] = None,
@@ -220,6 +223,9 @@ def _shard_modules(  # noqa: C901
     }

     if plan is None:
+        assert isinstance(
+            env, ShardingEnv
+        ), "Currently hybrid sharding only support use manual sharding plan"
         planner = EmbeddingShardingPlanner(
             topology=Topology(
                 local_world_size=get_local_size(env.world_size),
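In other words, the planner path still assumes a single ShardingEnv (it needs one world_size for the Topology), so hybrid sharding currently requires a pre-built plan. A quick sketch of what happens otherwise (illustration only; `quantized_model`, `sharder`, and `env_dict` are the placeholders from the earlier sketch):

# Passing a dict env without a plan trips the new assert above.
try:
    _shard_modules(module=quantized_model, sharders=[sharder], env=env_dict)
except AssertionError as err:
    print(err)  # "Currently hybrid sharding only support use manual sharding plan"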

torchrec/distributed/test_utils/infer_utils.py (+1 −1)
@@ -382,7 +382,7 @@ def shard(
         self,
         module: QuantEmbeddingCollection,
         params: Dict[str, ParameterSharding],
-        env: ShardingEnv,
+        env: Union[Dict[str, ShardingEnv], ShardingEnv],
         device: Optional[torch.device] = None,
     ) -> ShardedQuantEmbeddingCollection:
         fused_params = self.fused_params if self.fused_params else {}
New test file (+109 −0)

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

#!/usr/bin/env python3

import unittest

import torch
from torchrec import EmbeddingCollection, EmbeddingConfig
from torchrec.distributed.quant_embedding import QuantEmbeddingCollectionSharder
from torchrec.distributed.shard import _shard_modules
from torchrec.distributed.sharding_plan import (
    construct_module_sharding_plan,
    row_wise,
    table_wise,
)
from torchrec.distributed.test_utils.infer_utils import KJTInputWrapper, quantize
from torchrec.distributed.types import ShardingEnv, ShardingPlan


class InferHeteroShardingsTest(unittest.TestCase):
    # pyre-ignore
    @unittest.skipIf(
        torch.cuda.device_count() <= 3,
        "Not enough GPUs available",
    )
    def test_sharder_different_world_sizes(self) -> None:
        num_embeddings = 10
        emb_dim = 16
        world_size = 2
        local_size = 1
        tables = [
            EmbeddingConfig(
                num_embeddings=num_embeddings,
                embedding_dim=emb_dim,
                name=f"table_{i}",
                feature_names=[f"feature_{i}"],
            )
            for i in range(3)
        ]
        model = KJTInputWrapper(
            module_kjt_input=torch.nn.Sequential(
                EmbeddingCollection(
                    tables=tables,
                    device=torch.device("cpu"),
                )
            )
        )
        non_sharded_model = quantize(
            model,
            inplace=False,
            quant_state_dict_split_scale_bias=True,
            weight_dtype=torch.qint8,
        )
        sharder = QuantEmbeddingCollectionSharder()
        module_plan = construct_module_sharding_plan(
            non_sharded_model._module_kjt_input[0],
            per_param_sharding={
                "table_0": row_wise(([20, 10, 100], "cpu")),
                "table_1": table_wise(rank=0, device="cuda"),
                "table_2": table_wise(rank=1, device="cuda"),
            },
            # pyre-ignore
            sharder=sharder,
            local_size=local_size,
            world_size=world_size,
        )
        plan = ShardingPlan(plan={"_module_kjt_input.0": module_plan})
        env_dict = {
            "cpu": ShardingEnv.from_local(
                3,
                0,
            ),
            "cuda": ShardingEnv.from_local(
                2,
                0,
            ),
        }
        sharded_model = _shard_modules(
            module=non_sharded_model,
            # pyre-ignore
            sharders=[sharder],
            device=torch.device("cpu"),
            plan=plan,
            env=env_dict,
        )
        self.assertTrue(hasattr(sharded_model._module_kjt_input[0], "_lookups"))
        self.assertTrue(len(sharded_model._module_kjt_input[0]._lookups) == 2)
        for i, env in enumerate(env_dict.values()):
            self.assertTrue(
                hasattr(
                    sharded_model._module_kjt_input[0]._lookups[i],
                    "_embedding_lookups_per_rank",
                )
            )
            self.assertTrue(
                len(
                    sharded_model._module_kjt_input[0]
                    ._lookups[i]
                    ._embedding_lookups_per_rank
                )
                == env.world_size
            )
