From 94f74b55613531e8f0bf2472d5666d7f78b8f90f Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Fri, 30 Jun 2023 10:12:05 +0200 Subject: [PATCH 1/6] Initialize default metrics/loss inside ModelOutput instead (#1164) * Initialize default metrics/loss inside ModelOutput instead * Trying to change py38 -> py39 in tox.ini to see if that fixes CI * Trying to change py38 -> py39 in tox.ini to see if that fixes CI * Trying to change py39 -> py310 in tox.ini to see if that fixes CI * Update LD_LIBRARY_PATH --- .github/workflows/gpu.yml | 4 ++-- merlin/models/torch/outputs/classification.py | 16 +++++++--------- merlin/models/torch/outputs/regression.py | 11 +++++++---- tests/unit/torch/outputs/test_classification.py | 4 ++-- tests/unit/torch/outputs/test_regression.py | 2 +- tox.ini | 16 ++++++++-------- 6 files changed, 27 insertions(+), 26 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 8999056146..db4b63275e 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -34,7 +34,7 @@ jobs: if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-gpu + cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py310-gpu tests-examples: runs-on: 1GPU @@ -55,4 +55,4 @@ jobs: if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-gpu + cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py310-gpu diff --git a/merlin/models/torch/outputs/classification.py b/merlin/models/torch/outputs/classification.py index e201953fd0..2ca36143f7 100644 --- a/merlin/models/torch/outputs/classification.py +++ b/merlin/models/torch/outputs/classification.py @@ -36,24 +36,22 @@ class BinaryOutput(ModelOutput): The metrics used for evaluation. Default includes Accuracy, AUROC, Precision, and Recall. """ + DEFAULT_LOSS_CLS = nn.BCEWithLogitsLoss + DEFAULT_METRICS_CLS = (Accuracy, AUROC, Precision, Recall) + def __init__( self, schema: Optional[ColumnSchema] = None, - loss: nn.Module = nn.BCEWithLogitsLoss(), - metrics: Sequence[Metric] = ( - Accuracy(task="binary"), - AUROC(task="binary"), - Precision(task="binary"), - Recall(task="binary"), - ), + loss: Optional[nn.Module] = None, + metrics: Sequence[Metric] = (), ): """Initializes a BinaryOutput object.""" super().__init__( nn.LazyLinear(1), nn.Sigmoid(), schema=schema, - loss=loss, - metrics=metrics, + loss=loss or self.DEFAULT_LOSS_CLS(), + metrics=metrics or [m(task="binary") for m in self.DEFAULT_METRICS_CLS], ) def setup_schema(self, target: Optional[Union[ColumnSchema, Schema]]): diff --git a/merlin/models/torch/outputs/regression.py b/merlin/models/torch/outputs/regression.py index 0f9f9ad318..e3b2f97b09 100644 --- a/merlin/models/torch/outputs/regression.py +++ b/merlin/models/torch/outputs/regression.py @@ -36,18 +36,21 @@ class RegressionOutput(ModelOutput): The metrics used for evaluation. Default is MeanSquaredError. 
""" + DEFAULT_LOSS_CLS = nn.MSELoss + DEFAULT_METRICS_CLS = (MeanSquaredError,) + def __init__( self, schema: Optional[ColumnSchema] = None, - loss: nn.Module = nn.MSELoss(), - metrics: Sequence[Metric] = (MeanSquaredError(),), + loss: Optional[nn.Module] = None, + metrics: Sequence[Metric] = (), ): """Initializes a RegressionOutput object.""" super().__init__( nn.LazyLinear(1), schema=schema, - loss=loss, - metrics=metrics, + loss=loss or self.DEFAULT_LOSS_CLS(), + metrics=metrics or [m() for m in self.DEFAULT_METRICS_CLS], ) def setup_schema(self, target: Optional[Union[ColumnSchema, Schema]]): diff --git a/tests/unit/torch/outputs/test_classification.py b/tests/unit/torch/outputs/test_classification.py index ea6643c740..755d465350 100644 --- a/tests/unit/torch/outputs/test_classification.py +++ b/tests/unit/torch/outputs/test_classification.py @@ -31,12 +31,12 @@ def test_init(self): assert isinstance(binary_output, mm.BinaryOutput) assert isinstance(binary_output.loss, nn.BCEWithLogitsLoss) - assert binary_output.metrics == ( + assert binary_output.metrics == [ Accuracy(task="binary"), AUROC(task="binary"), Precision(task="binary"), Recall(task="binary"), - ) + ] assert binary_output.output_schema == Schema() def test_identity(self): diff --git a/tests/unit/torch/outputs/test_regression.py b/tests/unit/torch/outputs/test_regression.py index 17541c5081..f8537bca51 100644 --- a/tests/unit/torch/outputs/test_regression.py +++ b/tests/unit/torch/outputs/test_regression.py @@ -30,7 +30,7 @@ def test_init(self): assert isinstance(reg_output, mm.RegressionOutput) assert isinstance(reg_output.loss, nn.MSELoss) - assert reg_output.metrics == (MeanSquaredError,) + assert reg_output.metrics == [MeanSquaredError()] assert reg_output.output_schema == Schema() def test_identity(self): diff --git a/tox.ini b/tox.ini index 2446672a8f..67477b0fb9 100644 --- a/tox.ini +++ b/tox.ini @@ -2,14 +2,14 @@ ; .github/workflows/cpu-ci.yml for the workflow definition. [tox] -envlist = py38-gpu,py38-multi-gpu +envlist = py310-gpu,py310-multi-gpu [testenv] commands = pip install --upgrade pip pip install -e .[all] -[testenv:py38-gpu] +[testenv:py310-gpu] ; Runs in: Github Actions ; Runs GPU-based tests. allowlist_externals = @@ -28,7 +28,7 @@ commands = python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH:main} bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs tests/ || ([ $? = 5 ] && exit 0 || exit $?)' -[testenv:py38-multi-gpu] +[testenv:py310-multi-gpu] ; Runs in: Github Actions ; Runs GPU-based tests. allowlist_externals = @@ -40,7 +40,7 @@ passenv = setenv = TF_GPU_ALLOCATOR=cuda_malloc_async CPATH={env:CPATH}{:}{envdir}/hugectr/include - LD_LIBRARY_PATH=${envdir}/hugectr/include/lib{:}/usr/local/lib/python3.8/dist-packages/tensorflow{:}{env:LD_LIBRARY_PATH} + LD_LIBRARY_PATH=${envdir}/hugectr/include/lib{:}/usr/local/lib/python3.10/dist-packages/tensorflow{:}{env:LD_LIBRARY_PATH} LIBRARY_PATH=${envdir}/hugectr/lib{:}{env:LIBRARY_PATH} sitepackages=true commands = @@ -50,7 +50,7 @@ commands = sh examples/usecases/multi-gpu/install_sparse_operation_kit.sh {envdir} bash -c 'horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh python -m pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit || ([ $? 
= 5 ] && exit 0 || exit $?)' -[testenv:py38-horovod-cpu] +[testenv:py310-horovod-cpu] setenv = HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 @@ -66,7 +66,7 @@ commands = {envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{env:MERLIN_BRANCH:main} {envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit -[testenv:py38-nvtabular-cpu] +[testenv:py310-nvtabular-cpu] passenv=GIT_COMMIT allowlist_externals = git deps = @@ -82,7 +82,7 @@ commands = python -m pip install . python -m pytest nvtabular-{env:GIT_COMMIT}/tests/unit -[testenv:py38-systems-cpu] +[testenv:py310-systems-cpu] passenv=GIT_COMMIT allowlist_externals = git deps = @@ -99,7 +99,7 @@ commands = python -m pip install . python -m pytest -m "not notebook" systems-{env:GIT_COMMIT}/tests/unit -[testenv:py38-transformers4rec-cpu] +[testenv:py310-transformers4rec-cpu] passenv=GIT_COMMIT allowlist_externals = git commands = From b671c8e58231f0a39a4ac6e527dc28506633073b Mon Sep 17 00:00:00 2001 From: tien-nguyen Date: Fri, 30 Jun 2023 08:32:55 -0700 Subject: [PATCH 2/6] We need to remove README after the hyper link. (#1163) Co-authored-by: Marc Romeyn --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bca6283a66..94a80bd7be 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ You can find more details and information about a low-level API in our overview ### Notebook Examples and Tutorials -View the example notebooks in the [documentation](https://nvidia-merlin.github.io/models/stable/examples/README.html) to help you become familiar with Merlin Models. +View the example notebooks in the [documentation](https://nvidia-merlin.github.io/models/stable/examples/) to help you become familiar with Merlin Models. The same notebooks are available in the `examples` directory from the [Merlin Models](https://github.com/NVIDIA-Merlin/models) GitHub repository. 
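
The motivation for the default-initialization change in PATCH 1/6 is that torchmetrics objects carry accumulated state, so instantiated defaults in a function signature end up shared across every output built without arguments. A minimal standalone sketch of the pattern, using illustrative names rather than the actual ModelOutput class:

from typing import Optional, Sequence

import torch.nn as nn
from torchmetrics import Accuracy, Metric


class Output:
    """Illustrative stand-in for ModelOutput; not the Merlin class itself."""

    DEFAULT_LOSS_CLS = nn.BCEWithLogitsLoss
    DEFAULT_METRICS_CLS = (Accuracy,)

    def __init__(self, loss: Optional[nn.Module] = None, metrics: Sequence[Metric] = ()):
        # Construct defaults per instance: metrics accumulate state in update(),
        # so a signature default like `metrics=(Accuracy(task="binary"),)` would
        # be the same object in every Output created without arguments.
        self.loss = loss or self.DEFAULT_LOSS_CLS()
        self.metrics = list(metrics) or [m(task="binary") for m in self.DEFAULT_METRICS_CLS]


a, b = Output(), Output()
assert a.metrics[0] is not b.metrics[0]  # each instance gets fresh metric state
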
From 1783951391ae800925e05333f4fc82039c12e102 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy Date: Sat, 1 Jul 2023 01:57:21 +0100 Subject: [PATCH 3/6] Rename and update tox environments (#1168) * Remove python prefix from tox environment names * Move Merlin dependencies to deps configuation of tox environment * Use posargs for tests path in tox environments --- .github/workflows/cpu-horovod.yml | 2 +- .github/workflows/cpu-nvtabular.yml | 2 +- .github/workflows/cpu-systems.yml | 2 +- .github/workflows/cpu-t4r.yml | 2 +- .github/workflows/gpu-multi.yml | 2 +- .github/workflows/gpu.yml | 4 +- tox.ini | 57 +++++++++++++++-------------- 7 files changed, 37 insertions(+), 34 deletions(-) diff --git a/.github/workflows/cpu-horovod.yml b/.github/workflows/cpu-horovod.yml index 5f08ae3cc1..13c93d2588 100644 --- a/.github/workflows/cpu-horovod.yml +++ b/.github/workflows/cpu-horovod.yml @@ -72,4 +72,4 @@ jobs: if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - EXTRA_PYTEST_MARKERS="$extra_pytest_markers" MERLIN_BRANCH="$merlin_branch" COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-horovod-cpu + EXTRA_PYTEST_MARKERS="$extra_pytest_markers" MERLIN_BRANCH="$merlin_branch" COMPARE_BRANCH=${{ github.base_ref }} tox -e horovod-cpu diff --git a/.github/workflows/cpu-nvtabular.yml b/.github/workflows/cpu-nvtabular.yml index ab095e5dd9..21143cc2f5 100644 --- a/.github/workflows/cpu-nvtabular.yml +++ b/.github/workflows/cpu-nvtabular.yml @@ -64,4 +64,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-nvtabular-cpu + MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e nvtabular-cpu diff --git a/.github/workflows/cpu-systems.yml b/.github/workflows/cpu-systems.yml index 01ef47b35f..0106791290 100644 --- a/.github/workflows/cpu-systems.yml +++ b/.github/workflows/cpu-systems.yml @@ -64,4 +64,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-systems-cpu + MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e systems-cpu diff --git a/.github/workflows/cpu-t4r.yml b/.github/workflows/cpu-t4r.yml index cdda721ec4..2879fb9614 100644 --- a/.github/workflows/cpu-t4r.yml +++ b/.github/workflows/cpu-t4r.yml @@ -60,4 +60,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-transformers4rec-cpu + MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e transformers4rec-cpu diff --git a/.github/workflows/gpu-multi.yml b/.github/workflows/gpu-multi.yml index 62e26961b9..3d47558932 100644 --- a/.github/workflows/gpu-multi.yml +++ b/.github/workflows/gpu-multi.yml @@ -56,4 +56,4 @@ jobs: if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; EXTRA_PYTEST_MARKERS=$extra_pytest_markers MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-multi-gpu + cd ${{ github.workspace }}; EXTRA_PYTEST_MARKERS=$extra_pytest_markers MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e multi-gpu diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index db4b63275e..90478661c9 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -34,7 +34,7 @@ jobs: 
if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py310-gpu + cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu tests-examples: runs-on: 1GPU @@ -55,4 +55,4 @@ jobs: if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py310-gpu + cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu diff --git a/tox.ini b/tox.ini index 67477b0fb9..dc516cde78 100644 --- a/tox.ini +++ b/tox.ini @@ -2,33 +2,34 @@ ; .github/workflows/cpu-ci.yml for the workflow definition. [tox] -envlist = py310-gpu,py310-multi-gpu +envlist = gpu,multi-gpu,horovod-cpu,nvtabular-cpu,systems-cpu,transformers4rec-cpu,docs,docs-multi [testenv] commands = pip install --upgrade pip pip install -e .[all] -[testenv:py310-gpu] +[testenv:gpu] ; Runs in: Github Actions ; Runs GPU-based tests. allowlist_externals = bash deps = - --no-deps -rrequirements/test.txt + -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH} passenv = OPAL_PREFIX setenv = TF_GPU_ALLOCATOR=cuda_malloc_async sitepackages=true commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH:main} - bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs tests/ || ([ $? = 5 ] && exit 0 || exit $?)' + bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)' + -[testenv:py310-multi-gpu] +[testenv:multi-gpu] ; Runs in: Github Actions ; Runs GPU-based tests. 
allowlist_externals = @@ -43,30 +44,32 @@ setenv = LD_LIBRARY_PATH=${envdir}/hugectr/include/lib{:}/usr/local/lib/python3.10/dist-packages/tensorflow{:}{env:LD_LIBRARY_PATH} LIBRARY_PATH=${envdir}/hugectr/lib{:}{env:LIBRARY_PATH} sitepackages=true +deps = + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH} commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{env:MERLIN_BRANCH:main} sh examples/usecases/multi-gpu/install_sparse_operation_kit.sh {envdir} - bash -c 'horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh python -m pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit || ([ $? = 5 ] && exit 0 || exit $?)' + bash -c 'horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh python -m pytest -m "unit and horovod {env:EXTRA_PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)' -[testenv:py310-horovod-cpu] +[testenv:horovod-cpu] setenv = HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 PATH={env:PATH}{:}{envdir}/env/bin LD_LIBRARY_PATH={env:LD_LIBRARY_PATH}{:}{envdir}/env/lib +deps = + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH} commands = conda update --yes --name base --channel defaults conda conda env create --prefix {envdir}/env --file requirements/horovod-cpu-environment.yml --force {envdir}/env/bin/python -m pip install 'horovod==0.27.0' --no-cache-dir {envdir}/env/bin/horovodrun --check-build - {envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} - {envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} - {envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{env:MERLIN_BRANCH:main} - {envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit + {envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m "unit and horovod {env:EXTRA_PYTEST_MARKERS}" -rxs {posargs:tests} -[testenv:py310-nvtabular-cpu] +[testenv:nvtabular-cpu] passenv=GIT_COMMIT allowlist_externals = git deps = @@ -82,7 +85,7 @@ commands = python -m pip install . python -m pytest nvtabular-{env:GIT_COMMIT}/tests/unit -[testenv:py310-systems-cpu] +[testenv:systems-cpu] passenv=GIT_COMMIT allowlist_externals = git deps = @@ -99,7 +102,7 @@ commands = python -m pip install . 
python -m pytest -m "not notebook" systems-{env:GIT_COMMIT}/tests/unit -[testenv:py310-transformers4rec-cpu] +[testenv:transformers4rec-cpu] passenv=GIT_COMMIT allowlist_externals = git commands = @@ -120,10 +123,10 @@ changedir = {toxinidir} deps = -rrequirements/docs.txt -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git + git+https://github.com/NVIDIA-Merlin/dataloader.git + git+https://github.com/NVIDIA-Merlin/NVTabular.git commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git python -m sphinx.cmd.build -E -P -b html docs/source docs/build/html [testenv:docs-multi] @@ -132,9 +135,9 @@ changedir = {toxinidir} deps = -rrequirements/docs.txt -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git + git+https://github.com/NVIDIA-Merlin/dataloader.git + git+https://github.com/NVIDIA-Merlin/NVTabular.git commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git sphinx-multiversion --dump-metadata docs/source docs/build/html | jq "keys" sphinx-multiversion docs/source docs/build/html From b89f66e13dbce55dac4cf724f74b7a34cfe7b1c3 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy Date: Sat, 1 Jul 2023 02:37:48 +0100 Subject: [PATCH 4/6] Add fixture to cleanup dataloader (#1167) Co-authored-by: edknv <109497216+edknv@users.noreply.github.com> --- tests/conftest.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 6e5daeb58b..386ea8884f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,6 +20,7 @@ import platform import warnings from pathlib import Path +from unittest.mock import patch import distributed import psutil @@ -27,6 +28,7 @@ from asvdb import BenchmarkInfo, utils from merlin.core.utils import Distributed +from merlin.dataloader.loader_base import LoaderBase from merlin.datasets.synthetic import generate_data from merlin.io import Dataset from merlin.models.utils import ci_utils @@ -145,3 +147,17 @@ def get_benchmark_info(): arch=uname.machine, ram="%d" % psutil.virtual_memory().total, ) + + +@pytest.fixture(scope="function", autouse=True) +def cleanup_dataloader(): + """After each test runs. Call .stop() on any dataloaders created during the test. + The avoids issues with background threads hanging around and interfering with subsequent tests. + This happens when a dataloader is partially consumed (not all batches are iterated through). 
+ """ + with patch.object( + LoaderBase, "__iter__", side_effect=LoaderBase.__iter__, autospec=True + ) as patched: + yield + for call in patched.call_args_list: + call.args[0].stop() From 25f98f1566eb05ad8df3788e1dee550963a78fed Mon Sep 17 00:00:00 2001 From: Oliver Holworthy Date: Sat, 1 Jul 2023 04:20:25 +0100 Subject: [PATCH 5/6] Replace import of collections.Sequence with collections.abc.Sequence (#1166) Co-authored-by: edknv <109497216+edknv@users.noreply.github.com> --- merlin/models/tf/core/tabular.py | 4 ++-- merlin/models/tf/inputs/embedding.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/merlin/models/tf/core/tabular.py b/merlin/models/tf/core/tabular.py index 33b1ed5b42..3f2bb8e574 100644 --- a/merlin/models/tf/core/tabular.py +++ b/merlin/models/tf/core/tabular.py @@ -1,5 +1,5 @@ import abc -import collections +import collections.abc import copy from typing import Dict, List, Optional, Sequence, Union, overload @@ -600,7 +600,7 @@ def get_config(self): def select_by_tag(self, tags: Tags) -> Optional["Filter"]: if isinstance(self.feature_names, Tags): schema = self.schema.select_by_tag(self.feature_names).select_by_tag(tags) - elif isinstance(self.feature_names, collections.Sequence): + elif isinstance(self.feature_names, collections.abc.Sequence): schema = self.schema.select_by_name(self.feature_names).select_by_tag(tags) else: raise RuntimeError( diff --git a/merlin/models/tf/inputs/embedding.py b/merlin/models/tf/inputs/embedding.py index aff30184a6..156ef974e8 100644 --- a/merlin/models/tf/inputs/embedding.py +++ b/merlin/models/tf/inputs/embedding.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import collections +import collections.abc import inspect from copy import deepcopy from dataclasses import dataclass @@ -268,7 +268,7 @@ def select_by_tag(self, tags: Union[Tags, Sequence[Tags]]) -> Optional["Embeddin ------- An EmbeddingTable if the tags match. If no features match, it returns None. 
""" - if not isinstance(tags, collections.Sequence): + if not isinstance(tags, collections.abc.Sequence): tags = [tags] selected_schema = self.schema.select_by_tag(tags) From 86d0a34bf5902fe1510678eafccd5841810f4f5e Mon Sep 17 00:00:00 2001 From: edknv <109497216+edknv@users.noreply.github.com> Date: Sat, 1 Jul 2023 19:04:12 +0900 Subject: [PATCH 6/6] Add DLRM block (#1162) * First pass over DLRM-related blocks * add a test for dlrm block and fixes to make test pass * Add unit tests * Initialize default metrics/loss inside ModelOutput instead * Update merlin/models/torch/blocks/dlrm.py Co-authored-by: Radek Osmulski * fixes and changes for failing tests * pass batch to model test --------- Co-authored-by: Marc Romeyn Co-authored-by: Radek Osmulski --- merlin/models/torch/__init__.py | 2 + merlin/models/torch/blocks/dlrm.py | 141 +++++++++++++++++++++++ merlin/models/torch/router.py | 3 + merlin/models/torch/transforms/agg.py | 3 + tests/unit/torch/blocks/test_dlrm.py | 112 ++++++++++++++++++ tests/unit/torch/models/test_base.py | 6 +- tests/unit/torch/outputs/test_tabular.py | 6 +- 7 files changed, 268 insertions(+), 5 deletions(-) create mode 100644 merlin/models/torch/blocks/dlrm.py create mode 100644 tests/unit/torch/blocks/test_dlrm.py diff --git a/merlin/models/torch/__init__.py b/merlin/models/torch/__init__.py index eef6f9ece6..d2326af5e9 100644 --- a/merlin/models/torch/__init__.py +++ b/merlin/models/torch/__init__.py @@ -17,6 +17,7 @@ from merlin.models.torch import schema from merlin.models.torch.batch import Batch, Sequence from merlin.models.torch.block import Block, ParallelBlock +from merlin.models.torch.blocks.dlrm import DLRMBlock from merlin.models.torch.blocks.mlp import MLPBlock from merlin.models.torch.inputs.embedding import EmbeddingTable, EmbeddingTables from merlin.models.torch.inputs.select import SelectFeatures, SelectKeys @@ -51,4 +52,5 @@ "Concat", "Stack", "schema", + "DLRMBlock", ] diff --git a/merlin/models/torch/blocks/dlrm.py b/merlin/models/torch/blocks/dlrm.py new file mode 100644 index 0000000000..a24e4d1f71 --- /dev/null +++ b/merlin/models/torch/blocks/dlrm.py @@ -0,0 +1,141 @@ +from typing import Dict, Optional + +import torch +from torch import nn + +from merlin.models.torch.block import Block +from merlin.models.torch.inputs.embedding import EmbeddingTables +from merlin.models.torch.inputs.tabular import TabularInputBlock +from merlin.models.torch.link import Link +from merlin.models.torch.transforms.agg import MaybeAgg, Stack +from merlin.models.utils.doc_utils import docstring_parameter +from merlin.schema import Schema, Tags + +_DLRM_REF = """ + References + ---------- + .. [1] Naumov, Maxim, et al. "Deep learning recommendation model for + personalization and recommendation systems." arXiv preprint arXiv:1906.00091 (2019). +""" + + +@docstring_parameter(dlrm_reference=_DLRM_REF) +class DLRMInputBlock(TabularInputBlock): + """Input block for DLRM model. + + Parameters + ---------- + schema : Schema, optional + The schema to use for selection. Default is None. + dim : int + The dimensionality of the output vectors. + bottom_block : Block + Block to pass the continuous features to. + Note that, the output dimensionality of this block must be equal to ``dim``. + + {dlrm_reference} + + Raises + ------ + ValueError + If no categorical input is provided in the schema. 
+ + """ + + def __init__(self, schema: Schema, dim: int, bottom_block: Block): + super().__init__(schema) + self.add_route(Tags.CATEGORICAL, EmbeddingTables(dim, seq_combiner="mean")) + self.add_route(Tags.CONTINUOUS, bottom_block) + + if "categorical" not in self: + raise ValueError("DLRMInputBlock must have a categorical input") + + +@docstring_parameter(dlrm_reference=_DLRM_REF) +class DLRMInteraction(nn.Module): + """ + This class defines the forward interaction operation as proposed + in the DLRM + `paper https://arxiv.org/pdf/1906.00091.pdf`_ [1]_. + + This forward operation performs elementwise multiplication + followed by a reduction sum (equivalent to a dot product) of all embedding pairs. + + {dlrm_reference} + + """ + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + if not hasattr(self, "triu_indices"): + self.register_buffer( + "triu_indices", torch.triu_indices(inputs.shape[1], inputs.shape[1], offset=1) + ) + + interactions = torch.bmm(inputs, torch.transpose(inputs, 1, 2)) + interactions_flat = interactions[:, self.triu_indices[0], self.triu_indices[1]] + + return interactions_flat + + +class ShortcutConcatContinuous(Link): + """ + A shortcut connection that concatenates + continuous input features and intermediate outputs. + + When there's no continuous input, the intermediate output is returned. + """ + + def forward(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor: + intermediate_output = self.output(inputs) + + if "continuous" in inputs: + return torch.cat((inputs["continuous"], intermediate_output), dim=1) + + return intermediate_output + + +@docstring_parameter(dlrm_reference=_DLRM_REF) +class DLRMBlock(Block): + """Builds the DLRM architecture, as proposed in the following + `paper https://arxiv.org/pdf/1906.00091.pdf`_ [1]_. + + Parameters + ---------- + schema : Schema, optional + The schema to use for selection. Default is None. + dim : int + The dimensionality of the output vectors. + bottom_block : Block + Block to pass the continuous features to. + Note that, the output dimensionality of this block must be equal to ``dim``. + top_block : Block, optional + An optional upper-level block of the model. + interaction : nn.Module, optional + Interaction module for DLRM. + If not provided, DLRMInteraction will be used by default. + + {dlrm_reference} + + Raises + ------ + ValueError + If no categorical input is provided in the schema. 
+ """ + + def __init__( + self, + schema: Schema, + dim: int, + bottom_block: Block, + top_block: Optional[Block] = None, + interaction: Optional[nn.Module] = None, + ): + super().__init__(DLRMInputBlock(schema, dim, bottom_block)) + + self.append( + Block(MaybeAgg(Stack(dim=1)), interaction or DLRMInteraction()), + link=ShortcutConcatContinuous(), + ) + + if top_block: + self.append(top_block) diff --git a/merlin/models/torch/router.py b/merlin/models/torch/router.py index 29126e0f91..326064c68c 100644 --- a/merlin/models/torch/router.py +++ b/merlin/models/torch/router.py @@ -88,6 +88,9 @@ def add_route( """ routing_module = schema.select(self.selectable, selection) + if not routing_module: + return self + if module is not None: schema.setup_schema(module, routing_module.schema) diff --git a/merlin/models/torch/transforms/agg.py b/merlin/models/torch/transforms/agg.py index f6dcf457d6..552fcf1d32 100644 --- a/merlin/models/torch/transforms/agg.py +++ b/merlin/models/torch/transforms/agg.py @@ -104,6 +104,9 @@ def forward(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor: if self.align_dims: max_dims = max(tensor.dim() for tensor in sorted_tensors) + max_dims = max( + max_dims, 2 + ) # assume first dimension is batch size + at least one feature _sorted_tensors = [] for tensor in sorted_tensors: if tensor.dim() < max_dims: diff --git a/tests/unit/torch/blocks/test_dlrm.py b/tests/unit/torch/blocks/test_dlrm.py new file mode 100644 index 0000000000..21d65e8561 --- /dev/null +++ b/tests/unit/torch/blocks/test_dlrm.py @@ -0,0 +1,112 @@ +import math + +import pytest +import torch + +import merlin.models.torch as mm +from merlin.models.torch.batch import sample_batch +from merlin.models.torch.blocks.dlrm import DLRMInputBlock, DLRMInteraction +from merlin.models.torch.utils import module_utils +from merlin.schema import Tags + + +class TestDLRMInputBlock: + def test_routes_and_output_shapes(self, testing_data): + schema = testing_data.schema + embedding_dim = 64 + block = DLRMInputBlock(schema, embedding_dim, mm.MLPBlock([embedding_dim])) + + assert isinstance(block["categorical"], mm.EmbeddingTables) + assert len(block["categorical"]) == len(schema.select_by_tag(Tags.CATEGORICAL)) + + assert isinstance(block["continuous"][0], mm.SelectKeys) + assert isinstance(block["continuous"][1], mm.MLPBlock) + + batch_size = 16 + batch = sample_batch(testing_data, batch_size=batch_size) + + outputs = module_utils.module_test(block, batch) + + for col in schema.select_by_tag(Tags.CATEGORICAL): + assert outputs[col.name].shape == (batch_size, embedding_dim) + assert outputs["continuous"].shape == (batch_size, embedding_dim) + + +class TestDLRMInteraction: + @pytest.mark.parametrize( + "batch_size,num_features,dim", + [(16, 3, 3), (32, 5, 8), (64, 5, 4)], + ) + def test_output_shape(self, batch_size, num_features, dim): + module = DLRMInteraction() + inputs = torch.rand((batch_size, num_features, dim)) + outputs = module_utils.module_test(module, inputs) + + assert outputs.shape == (batch_size, num_features - 1 + math.comb(num_features - 1, 2)) + + +class TestDLRMBlock: + @pytest.fixture(autouse=True) + def setup_method(self, testing_data): + self.schema = testing_data.schema + self.batch_size = 16 + self.batch = sample_batch(testing_data, batch_size=self.batch_size) + + def test_dlrm_output_shape(self): + embedding_dim = 64 + block = mm.DLRMBlock( + self.schema, + dim=embedding_dim, + bottom_block=mm.MLPBlock([embedding_dim]), + ) + + outputs = module_utils.module_test(block, self.batch) + + 
num_features = len(self.schema.select_by_tag(Tags.CATEGORICAL)) + 1 + dot_product_dim = (num_features - 1) * num_features // 2 + assert list(outputs.shape) == [self.batch_size, dot_product_dim + embedding_dim] + + def test_dlrm_with_top_block(self): + embedding_dim = 32 + top_block_dim = 8 + block = mm.DLRMBlock( + self.schema, + dim=embedding_dim, + bottom_block=mm.MLPBlock([embedding_dim]), + top_block=mm.MLPBlock([top_block_dim]), + ) + + outputs = module_utils.module_test(block, self.batch) + + assert list(outputs.shape) == [self.batch_size, top_block_dim] + + def test_dlrm_block_no_categorical_features(self): + schema = self.schema.remove_by_tag(Tags.CATEGORICAL) + embedding_dim = 32 + + with pytest.raises(ValueError, match="must have a categorical input"): + _ = mm.DLRMBlock( + schema, + dim=embedding_dim, + bottom_block=mm.MLPBlock([embedding_dim]), + ) + + def test_dlrm_block_no_continuous_features(self, testing_data): + schema = testing_data.schema.remove_by_tag(Tags.CONTINUOUS) + testing_data.schema = schema + + embedding_dim = 32 + block = mm.DLRMBlock( + schema, + dim=embedding_dim, + bottom_block=mm.MLPBlock([embedding_dim]), + ) + + batch_size = 16 + batch = sample_batch(testing_data, batch_size=batch_size) + + outputs = module_utils.module_test(block, batch) + + num_features = len(schema.select_by_tag(Tags.CATEGORICAL)) + dot_product_dim = (num_features - 1) * num_features // 2 + assert list(outputs.shape) == [batch_size, dot_product_dim] diff --git a/tests/unit/torch/models/test_base.py b/tests/unit/torch/models/test_base.py index cedd3e6ff6..ab329b8ca1 100644 --- a/tests/unit/torch/models/test_base.py +++ b/tests/unit/torch/models/test_base.py @@ -133,11 +133,11 @@ def test_training_step_with_dataloader(self): mm.BinaryOutput(ColumnSchema("target")), ) - feature = [[1.0, 2.0], [3.0, 4.0]] - target = [[0.0], [1.0]] + feature = [2.0, 3.0] + target = [0.0, 1.0] dataset = Dataset(pd.DataFrame({"feature": feature, "target": target})) - with Loader(dataset, batch_size=1) as loader: + with Loader(dataset, batch_size=2) as loader: model.initialize(loader) batch = loader.peek() diff --git a/tests/unit/torch/outputs/test_tabular.py b/tests/unit/torch/outputs/test_tabular.py index b3dd2abe4b..22ae132735 100644 --- a/tests/unit/torch/outputs/test_tabular.py +++ b/tests/unit/torch/outputs/test_tabular.py @@ -29,6 +29,8 @@ def test_exceptions(self): with pytest.raises(ValueError, match="not found"): mm.TabularOutputBlock(self.schema, init="not_found") + def test_no_route_for_non_existent_tag(self): outputs = mm.TabularOutputBlock(self.schema) - with pytest.raises(ValueError): - outputs.add_route(Tags.CATEGORICAL) + outputs.add_route(Tags.CATEGORICAL) + + assert not outputs
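
The unit tests added in PATCH 6/6 double as usage documentation for the new block. A condensed sketch of the same call pattern, assuming `testing_data` is a schema-tagged merlin.io.Dataset as in the test fixtures above (dimensions are illustrative):

import merlin.models.torch as mm
from merlin.models.torch.batch import sample_batch
from merlin.models.torch.utils import module_utils

embedding_dim = 64

# Categorical columns are embedded to `dim`, continuous columns are projected
# to the same width by `bottom_block`, and all feature pairs are then
# dot-product interacted; the optional `top_block` consumes the concatenation.
block = mm.DLRMBlock(
    testing_data.schema,
    dim=embedding_dim,
    bottom_block=mm.MLPBlock([embedding_dim]),
    top_block=mm.MLPBlock([8]),
)

batch = sample_batch(testing_data, batch_size=16)
outputs = module_utils.module_test(block, batch)  # -> tensor of shape (16, 8)

Without a top block, the output width is the pairwise-interaction count plus the continuous projection width, which grows quickly with the number of features — the reason a top MLP is typically stacked on the result.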