Introduce distributed embeddings #974

Open · wants to merge 27 commits into main from distributed-embeddings

Commits (27)
8b111c9  rough draft (edknv, Feb 4, 2023)
39cc981  Introduce distributed embeddings (edknv, Feb 7, 2023)
b46bd20  Merge branch 'main' into distributed-embeddings (edknv, Feb 7, 2023)
dd3df0c  install distributed-embeddings package in tox (edknv, Feb 8, 2023)
847c05b  check if distributed-embeddings is installed before loading the class… (edknv, Feb 8, 2023)
7b19fb2  Merge branch 'main' into distributed-embeddings (edknv, Feb 8, 2023)
ded191e  lint (edknv, Feb 8, 2023)
99a9b2d  install distributed-embeddings from github repo (edknv, Feb 8, 2023)
77a4f5c  Merge branch 'main' into distributed-embeddings (edknv, Feb 8, 2023)
674d53f  graph mode support (edknv, Feb 15, 2023)
4a8bd97  add distributed-embeddings to ci (edknv, Feb 15, 2023)
1f75984  Add multi-gpu ci tests (edknv, Mar 7, 2023)
aa3c7b7  Merge branch 'main' into distributed-embeddings (edknv, Mar 7, 2023)
a0c58d7  remove graph mode error (edknv, Mar 7, 2023)
9892be8  lint and minor rearrangement (edknv, Mar 7, 2023)
c76b528  lint (edknv, Mar 7, 2023)
4fe3863  revert to using tensor.shape (edknv, Mar 7, 2023)
9ac60c7  whitelist sh in tox (edknv, Mar 7, 2023)
3c4b529  specify path in gha (edknv, Mar 7, 2023)
4a8df39  Merge branch 'main' into distributed-embeddings (edknv, Mar 7, 2023)
8c2e0e6  fix horovod cpu gha workflow (edknv, Mar 7, 2023)
5c7fd5a  move horovod installation to multi-gpu (edknv, Mar 7, 2023)
36bb606  use python -m in tox (edknv, Mar 7, 2023)
b69078f  Remove horovod installation (edknv, Mar 7, 2023)
2763cd8  clean up and add documentation (edknv, Mar 8, 2023)
c916eb0  Merge branch 'main' into distributed-embeddings (edknv, Mar 8, 2023)
2e2d449  Merge branch 'main' into distributed-embeddings (rnyak, Mar 15, 2023)
Files changed
24 changes: 24 additions & 0 deletions examples/usecases/multi-gpu/install_distributed_embeddings.sh
@@ -0,0 +1,24 @@
#!/bin/bash

set -e

INSTALL_DIR=$1

WORK_DIR=$(pwd)

cd $INSTALL_DIR

if [ ! -d "distributed-embeddings" ]; then
git clone https://github.com/NVIDIA-Merlin/distributed-embeddings.git
fi

cd distributed-embeddings

git submodule update --init --recursive
make pip_pkg
python -m pip install --force-reinstall artifacts/*.whl
python setup.py install

cd $WORK_DIR

python -c "import distributed_embeddings"
8 changes: 7 additions & 1 deletion merlin/models/tf/__init__.py
@@ -127,7 +127,10 @@
from merlin.models.tf.prediction_tasks.multi import PredictionTasks
from merlin.models.tf.prediction_tasks.regression import RegressionTask
from merlin.models.tf.prediction_tasks.retrieval import ItemRetrievalTask
from merlin.models.utils.dependencies import is_transformers_available
from merlin.models.utils.dependencies import (
is_distributed_embeddings_available,
is_transformers_available,
)

if is_transformers_available():
from merlin.models.tf.transformers.block import (
@@ -145,6 +148,9 @@
LastHiddenStateAndAttention,
)

if is_distributed_embeddings_available():
from merlin.models.tf.distributed.embedding import DistributedEmbeddings

from merlin.models.tf.transforms.features import (
BroadcastToSequence,
CategoryEncoding,
9 changes: 9 additions & 0 deletions merlin/models/tf/distributed/backend.py
@@ -1,13 +1,22 @@
hvd = None
hvd_installed = False

dmp = None
dmp_installed = False

try:
import horovod.tensorflow.keras as hvd # noqa: F401

hvd_installed = True
except ImportError:
pass

try:
from distributed_embeddings.python.layers import dist_model_parallel as dmp # noqa: F401

dmp_installed = True
except ImportError:
pass

if hvd_installed:
hvd.init()
155 changes: 155 additions & 0 deletions merlin/models/tf/distributed/embedding.py
@@ -0,0 +1,155 @@
from typing import Dict, List, Optional, Union

import tensorflow as tf

from merlin.models.tf.core.tabular import TabularBlock
from merlin.models.tf.distributed.backend import dmp, dmp_installed, hvd_installed
from merlin.models.utils.schema_utils import infer_embedding_dim
from merlin.schema import Schema


@tf.keras.utils.register_keras_serializable(package="merlin.models")
class DistributedEmbeddings(TabularBlock):
"""Large embedding table that automatically distributes embedding tables
to multiple GPUs.

Parameters
----------
schema: Schema
Schema containing the columns used in embedding tables.
dim: Optional[Union[Dict[str, int], int]], optional
If an int, the same embedding size is used for all features. If a dict,
maps feature name to embedding size, e.g. {"feature_name": embedding_size}.
If None, sizes are inferred from the schema. Default: None.
strategy: str
Indicates how embedding tables are distributed.
One of ["basic", "memory_balanced"]. Default: "basic".
column_slice_threshold: Optional[int]
Desired upper bound of element count in each slice.
dp_input: bool
If True, takes data-parallel input in shape [local_batch_size x global_num_embeddings].
Otherwise takes model-parallel input in shape [global_batch_size x local_num_embeddings].
Default: True.
input_table_map: Optional[List[int]]
A list with same length as inputs. Maps `input[i]` to `table[input_table_map[i]]`.
If None, `input[i]` maps to `table[i]`. Default: None.
"""

def __init__(
self,
schema: Schema,
dim: Optional[Union[Dict[str, int], int]] = None,
strategy: str = "basic",
column_slice_threshold: Optional[int] = None,
dp_input: bool = True,
input_table_map: Optional[List[int]] = None,
**kwargs,
):
if not hvd_installed or not dmp_installed:
raise ImportError(
"'horovod' and 'distributed-embeddings' are required to use "
f"{self.__class__.__name__}."
)

super(DistributedEmbeddings, self).__init__(schema=schema, **kwargs)

self.dim = dim
self.table_names = []
self.embedding_layers = []

for col in self.schema:
table_name = col.int_domain.name or col.name
self.table_names.append(table_name)
self.embedding_layers.append(
tf.keras.layers.Embedding(
input_dim=self._infer_input_dim(col),
output_dim=self._infer_output_dim(col, dim),
name=table_name,
)
)

self.embedding_layers = dmp.DistributedEmbedding(
self.embedding_layers,
strategy=strategy,
column_slice_threshold=column_slice_threshold,
dp_input=dp_input,
input_table_map=input_table_map,
)

def _infer_input_dim(self, col_schema):
return col_schema.int_domain.max + 1

def _infer_output_dim(self, col_schema, embedding_dims):
if isinstance(embedding_dims, dict):
dim = embedding_dims.get(col_schema.name)
elif isinstance(embedding_dims, int):
dim = embedding_dims
else:
dim = None

if dim is None:
dim = infer_embedding_dim(col_schema)

return dim

def build(self, input_shapes):
super().build(input_shapes)

if self.embedding_layers.built is True:
return

if isinstance(input_shapes, dict):
ordered_input_shapes = []
for feature_name in self.table_names:
ordered_input_shapes.append(input_shapes[feature_name])
elif isinstance(input_shapes, list):
ordered_input_shapes = input_shapes
else:
raise ValueError(f"Unexpected input type encountered: {input_shapes}")
self.embedding_layers.build(ordered_input_shapes)

@tf.function
def call(
self, inputs: Union[Dict[str, tf.Tensor], List[tf.Tensor]]
) -> Union[Dict[str, tf.Tensor], List[tf.Tensor]]:
"""
Parameters
----------
inputs : Union[Dict[str, tf.Tensor], List[tf.Tensor]]
Tensors or dictionary of tensors representing the input batch.

Returns
-------
A list of tensors or dict of tensors corresponding to the embeddings for the inputs
"""

if isinstance(inputs, dict):
ordered_inputs = []
outputs = {}
for feature_name in self.table_names:
ordered_inputs.append(inputs[feature_name])
ordered_outputs = self.embedding_layers(ordered_inputs)
for feature_name, output in zip(self.schema.column_names, ordered_outputs):
outputs[feature_name] = output
elif isinstance(inputs, list):
outputs = self.embedding_layers(inputs)
else:
raise ValueError(f"Unexpected input type encountered: {inputs}")

return outputs

@tf.function
def compute_call_output_shape(self, input_shapes):
def _get_output_shape(input_shape):
batch_size = input_shape[0]
output_shape = tf.TensorShape([batch_size, self.dim])
return output_shape

if isinstance(input_shapes, dict):
output_shapes = {k: _get_output_shape(v) for k, v in input_shapes.items()}
elif isinstance(input_shapes, list):
output_shapes = [_get_output_shape(x) for x in input_shapes]
else:
raise ValueError(f"Unexpected input type encountered: {input_shapes}")

return output_shapes
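For context, a minimal usage sketch of the block added above (not part of this diff): it builds a `DistributedEmbeddings` from a categorical schema and looks up embeddings for a dict batch. It assumes horovod and distributed-embeddings are installed (so `mm.DistributedEmbeddings` is exported and `hvd.init()` has run via the backend module) and that the process is launched with horovodrun; the column names, domains, and values are illustrative only.

```python
import numpy as np
import tensorflow as tf

import merlin.models.tf as mm
from merlin.schema import ColumnSchema, Schema, Tags

# Two illustrative categorical columns; input_dim is inferred as int_domain.max + 1.
schema = Schema(
    [
        ColumnSchema(
            "item_id",
            dtype=np.int32,
            properties={"domain": {"min": 0, "max": 100, "name": "item_id"}},
            tags=[Tags.CATEGORICAL],
        ),
        ColumnSchema(
            "user_id",
            dtype=np.int32,
            properties={"domain": {"min": 0, "max": 50, "name": "user_id"}},
            tags=[Tags.CATEGORICAL],
        ),
    ]
)

# One table per categorical column; tables are placed across GPUs by
# dmp.DistributedEmbedding using the default "basic" strategy.
embeddings = mm.DistributedEmbeddings(schema, dim=8)

batch = {
    "item_id": tf.constant([1, 2, 3], dtype=tf.int64),
    "user_id": tf.constant([4, 5, 6], dtype=tf.int64),
}
outputs = embeddings(batch)  # dict of [batch_size, 8] tensors keyed by column name
```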
10 changes: 10 additions & 0 deletions merlin/models/utils/dependencies.py
@@ -47,3 +47,13 @@ def is_transformers_available() -> bool:
except ImportError:
transformers = None
return transformers is not None


def is_distributed_embeddings_available() -> bool:
try:
import horovod # isort: skip
import distributed_embeddings
except ImportError:
horovod = None
distributed_embeddings = None
return horovod is not None and distributed_embeddings is not None
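A short sketch (not part of this diff) of how calling code might use the new helper to pick between the model-parallel block and the standard tables when the optional packages are missing; the `build_embeddings` helper and the `mm.Embeddings` fallback are assumptions for illustration, not part of this PR.

```python
import merlin.models.tf as mm
from merlin.models.utils.dependencies import is_distributed_embeddings_available
from merlin.schema import Tags


def build_embeddings(schema, dim=16):
    """Use model-parallel embeddings when available, else the default tables."""
    cat_schema = schema.select_by_tag(Tags.CATEGORICAL)
    if is_distributed_embeddings_available():
        # Tables sharded across GPUs by distributed-embeddings (requires horovod).
        return mm.DistributedEmbeddings(cat_schema, dim=dim)
    # Standard data-parallel embedding tables.
    return mm.Embeddings(cat_schema, dim=dim)
```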
81 changes: 81 additions & 0 deletions tests/unit/tf/horovod/test_embedding.py
@@ -0,0 +1,81 @@
import numpy as np
import pytest
import tensorflow as tf

import merlin.models.tf as mm
from merlin.schema import ColumnSchema, Schema, Tags

hvd = pytest.importorskip("horovod.tensorflow.keras")
dmp = pytest.importorskip("distributed_embeddings.python.layers.dist_model_parallel")


def generate_inputs(input_dims, global_batch_size):
global_inputs = [
tf.random.uniform(shape=[global_batch_size], minval=0, maxval=dim, dtype=tf.int64)
for dim in input_dims
]
for t in global_inputs:
hvd.broadcast(t, root_rank=0)
local_batch_size = global_batch_size // hvd.size()
rank = hvd.rank()
inputs = [t[rank * local_batch_size : (rank + 1) * local_batch_size] for t in global_inputs]
return inputs


def test_distributed_embeddings_basic(embedding_dim=4, global_batch_size=8):
column_schema_0 = ColumnSchema(
"col0",
dtype=np.int32,
properties={"domain": {"min": 0, "max": 10, "name": "col0"}},
tags=[Tags.CATEGORICAL],
)
column_schema_1 = ColumnSchema(
"col1",
dtype=np.int32,
properties={"domain": {"min": 0, "max": 20, "name": "col1"}},
tags=[Tags.CATEGORICAL],
)
schema = Schema([column_schema_0, column_schema_1])

inputs = generate_inputs([10, 20], global_batch_size)
table = mm.DistributedEmbeddings(schema, embedding_dim)
outputs = table(inputs)

assert len(outputs) == 2
assert outputs[0].shape == (global_batch_size // hvd.size(), embedding_dim)
assert outputs[1].shape == (global_batch_size // hvd.size(), embedding_dim)


@pytest.mark.parametrize("run_eagerly", [True, False])
def test_dlrm_model_with_embeddings(
music_streaming_data, run_eagerly, batch_size=8, embedding_dim=16, learning_rate=0.03
):
music_streaming_data.schema = music_streaming_data.schema.select_by_name(
["item_id", "user_id", "user_age", "click"]
)
train = music_streaming_data.repartition(npartitions=hvd.size())
train_loader = mm.Loader(
train,
schema=train.schema,
batch_size=batch_size,
shuffle=True,
drop_last=True,
)

target_column = train.schema.select_by_tag(Tags.TARGET).column_names[0]

model = mm.DLRMModel(
train.schema,
embeddings=mm.DistributedEmbeddings(
train.schema.select_by_tag(Tags.CATEGORICAL), dim=embedding_dim
),
bottom_block=mm.MLPBlock([32, embedding_dim]),
top_block=mm.MLPBlock([32, embedding_dim]),
prediction_tasks=mm.BinaryClassificationTask(target_column),
)

opt = tf.keras.optimizers.Adagrad(learning_rate=learning_rate)
model.compile(optimizer=opt, run_eagerly=run_eagerly, metrics=[tf.keras.metrics.AUC()])

losses = model.fit(train_loader, epochs=2)
assert all(measure >= 0 for metric in losses.history for measure in losses.history[metric])
15 changes: 11 additions & 4 deletions tox.ini
@@ -28,19 +28,23 @@ commands =
; Runs in: Github Actions
; Runs GPU-based tests.
allowlist_externals =
bash
horovodrun
deps =
-rrequirements/test.txt
passenv =
OPAL_PREFIX
setenv =
TF_GPU_ALLOCATOR=cuda_malloc_async
sitepackages=true
commands =
# Install Merlin packages
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{posargs:main}
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git@{posargs:main}
python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{posargs:main}
horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m horovod -rxs tests/unit
# Install distributed embeddings and check build
# TODO: Move distributed-embeddings installation to CI runner.
bash examples/usecases/multi-gpu/install_distributed_embeddings.sh {envtmpdir}
# Run multi-gpu tests marked with `horovod` marker
horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh python -m pytest -m horovod -rxs tests/unit

[testenv:py38-horovod-cpu]
setenv =
@@ -51,12 +55,15 @@
commands =
conda update --yes --name base --channel defaults conda
conda env create --prefix {envdir}/env --file requirements/horovod-cpu-environment.yml --force
# Install horovod and check build
{envdir}/env/bin/python -m pip install horovod --no-cache-dir
{envdir}/env/bin/horovodrun --check-build
# Install Merlin packages
{envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git
{envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git
{envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git
{envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m horovod -rxs tests/unit
# Run multi-gpu tests marked with `horovod` marker
{envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh {envdir}/env/bin/python -m pytest -m horovod -rxs tests/unit

[testenv:py38-nvtabular-cpu]
passenv=GIT_COMMIT