From f9fb4ce0f69f330fa4624b12ddcf14ee20716f5b Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 13:56:40 +0100 Subject: [PATCH 01/27] Remove python prefix from tox environment names --- .github/workflows/cpu-horovod.yml | 2 +- .github/workflows/cpu-nvtabular.yml | 2 +- .github/workflows/cpu-systems.yml | 2 +- .github/workflows/cpu-t4r.yml | 2 +- .github/workflows/gpu-multi.yml | 2 +- .github/workflows/gpu.yml | 4 ++-- tox.ini | 14 +++++++------- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/cpu-horovod.yml b/.github/workflows/cpu-horovod.yml index 5f08ae3cc1..13c93d2588 100644 --- a/.github/workflows/cpu-horovod.yml +++ b/.github/workflows/cpu-horovod.yml @@ -72,4 +72,4 @@ jobs: if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - EXTRA_PYTEST_MARKERS="$extra_pytest_markers" MERLIN_BRANCH="$merlin_branch" COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-horovod-cpu + EXTRA_PYTEST_MARKERS="$extra_pytest_markers" MERLIN_BRANCH="$merlin_branch" COMPARE_BRANCH=${{ github.base_ref }} tox -e horovod-cpu diff --git a/.github/workflows/cpu-nvtabular.yml b/.github/workflows/cpu-nvtabular.yml index ab095e5dd9..21143cc2f5 100644 --- a/.github/workflows/cpu-nvtabular.yml +++ b/.github/workflows/cpu-nvtabular.yml @@ -64,4 +64,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-nvtabular-cpu + MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e nvtabular-cpu diff --git a/.github/workflows/cpu-systems.yml b/.github/workflows/cpu-systems.yml index 01ef47b35f..0106791290 100644 --- a/.github/workflows/cpu-systems.yml +++ b/.github/workflows/cpu-systems.yml @@ -64,4 +64,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - 
MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-systems-cpu + MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e systems-cpu diff --git a/.github/workflows/cpu-t4r.yml b/.github/workflows/cpu-t4r.yml index cdda721ec4..2879fb9614 100644 --- a/.github/workflows/cpu-t4r.yml +++ b/.github/workflows/cpu-t4r.yml @@ -60,4 +60,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-transformers4rec-cpu + MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e transformers4rec-cpu diff --git a/.github/workflows/gpu-multi.yml b/.github/workflows/gpu-multi.yml index 62e26961b9..3d47558932 100644 --- a/.github/workflows/gpu-multi.yml +++ b/.github/workflows/gpu-multi.yml @@ -56,4 +56,4 @@ jobs: if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; EXTRA_PYTEST_MARKERS=$extra_pytest_markers MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-multi-gpu + cd ${{ github.workspace }}; EXTRA_PYTEST_MARKERS=$extra_pytest_markers MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e multi-gpu diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index db4b63275e..90478661c9 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -34,7 +34,7 @@ jobs: if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py310-gpu + cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu tests-examples: runs-on: 1GPU @@ -55,4 +55,4 @@ jobs: 
if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py310-gpu + cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu diff --git a/tox.ini b/tox.ini index 67477b0fb9..c530578a6a 100644 --- a/tox.ini +++ b/tox.ini @@ -2,14 +2,14 @@ ; .github/workflows/cpu-ci.yml for the workflow definition. [tox] -envlist = py310-gpu,py310-multi-gpu +envlist = gpu,multi-gpu,horovod-cpu,nvtabular-cpu,systems-cpu,transformers4rec-cpu,docs,docs-multi [testenv] commands = pip install --upgrade pip pip install -e .[all] -[testenv:py310-gpu] +[testenv:gpu] ; Runs in: Github Actions ; Runs GPU-based tests. allowlist_externals = @@ -28,7 +28,7 @@ commands = python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH:main} bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs tests/ || ([ $? = 5 ] && exit 0 || exit $?)' -[testenv:py310-multi-gpu] +[testenv:multi-gpu] ; Runs in: Github Actions ; Runs GPU-based tests. allowlist_externals = @@ -50,7 +50,7 @@ commands = sh examples/usecases/multi-gpu/install_sparse_operation_kit.sh {envdir} bash -c 'horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh python -m pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit || ([ $? 
= 5 ] && exit 0 || exit $?)' -[testenv:py310-horovod-cpu] +[testenv:horovod-cpu] setenv = HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 @@ -66,7 +66,7 @@ commands = {envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{env:MERLIN_BRANCH:main} {envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit -[testenv:py310-nvtabular-cpu] +[testenv:nvtabular-cpu] passenv=GIT_COMMIT allowlist_externals = git deps = @@ -82,7 +82,7 @@ commands = python -m pip install . python -m pytest nvtabular-{env:GIT_COMMIT}/tests/unit -[testenv:py310-systems-cpu] +[testenv:systems-cpu] passenv=GIT_COMMIT allowlist_externals = git deps = @@ -99,7 +99,7 @@ commands = python -m pip install . python -m pytest -m "not notebook" systems-{env:GIT_COMMIT}/tests/unit -[testenv:py310-transformers4rec-cpu] +[testenv:transformers4rec-cpu] passenv=GIT_COMMIT allowlist_externals = git commands = From a0e2ba024d314758a6374da9e1151a409d73388f Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 13:57:38 +0100 Subject: [PATCH 02/27] Add transformers to test requirements --- requirements/test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/test.txt b/requirements/test.txt index 91c50ffa24..b013591b21 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,6 @@ -r dev.txt -r pytorch.txt -r tensorflow.txt +-r transformers.txt numpy<1.24 From 60eb203cb7e8b5463e749e917a7c1706be8ea2bb Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 13:58:46 +0100 Subject: [PATCH 03/27] Add gpu-cu11 tox environment --- tox.ini | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index c530578a6a..d6faf2ec05 100644 --- a/tox.ini +++ b/tox.ini @@ -2,13 +2,36 @@ ; 
.github/workflows/cpu-ci.yml for the workflow definition. [tox] -envlist = gpu,multi-gpu,horovod-cpu,nvtabular-cpu,systems-cpu,transformers4rec-cpu,docs,docs-multi +envlist = gpu,gpu-cu11,multi-gpu,horovod-cpu,nvtabular-cpu,systems-cpu,transformers4rec-cpu,docs,docs-multi [testenv] commands = pip install --upgrade pip pip install -e .[all] +[testenv:gpu-cu11] +; Runs in: GitHub Actions +; Runs GPU-based tests. +setenv = + TF_GPU_ALLOCATOR=cuda_malloc_async + PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com + PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python +allowlist_externals = + bash +passenv = + CUDA_VISIBLE_DEVICES +deps = + -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH} + nvidia-cudnn-cu11~=8.6.0 + cudf-cu11=={env:RAPIDS_VERSION} + dask-cudf-cu11=={env:RAPIDS_VERSION} +commands = + bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)' + [testenv:gpu] ; Runs in: Github Actions ; Runs GPU-based tests. 
From bc8918a6b47cae9d02954b9a42771b2a736264ab Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:03:15 +0100 Subject: [PATCH 04/27] Move Merlin dependencies to deps configuration of tox environment --- tox.ini | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/tox.ini b/tox.ini index d6faf2ec05..bd1fd711f6 100644 --- a/tox.ini +++ b/tox.ini @@ -38,19 +38,20 @@ commands = allowlist_externals = bash deps = - --no-deps -rrequirements/test.txt + -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH} passenv = OPAL_PREFIX setenv = TF_GPU_ALLOCATOR=cuda_malloc_async sitepackages=true commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH:main} bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs tests/ || ([ $? = 5 ] && exit 0 || exit $?)' + [testenv:multi-gpu] ; Runs in: Github Actions ; Runs GPU-based tests. 
@@ -66,10 +67,11 @@ setenv = LD_LIBRARY_PATH=${envdir}/hugectr/include/lib{:}/usr/local/lib/python3.10/dist-packages/tensorflow{:}{env:LD_LIBRARY_PATH} LIBRARY_PATH=${envdir}/hugectr/lib{:}{env:LIBRARY_PATH} sitepackages=true +deps = + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH} commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{env:MERLIN_BRANCH:main} sh examples/usecases/multi-gpu/install_sparse_operation_kit.sh {envdir} bash -c 'horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh python -m pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit || ([ $? 
= 5 ] && exit 0 || exit $?)' @@ -79,14 +81,15 @@ setenv = HOROVOD_WITH_TENSORFLOW=1 PATH={env:PATH}{:}{envdir}/env/bin LD_LIBRARY_PATH={env:LD_LIBRARY_PATH}{:}{envdir}/env/lib +deps = + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH} commands = conda update --yes --name base --channel defaults conda conda env create --prefix {envdir}/env --file requirements/horovod-cpu-environment.yml --force {envdir}/env/bin/python -m pip install 'horovod==0.27.0' --no-cache-dir {envdir}/env/bin/horovodrun --check-build - {envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} - {envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} - {envdir}/env/bin/python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git@{env:MERLIN_BRANCH:main} {envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit [testenv:nvtabular-cpu] @@ -143,10 +146,10 @@ changedir = {toxinidir} deps = -rrequirements/docs.txt -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git + git+https://github.com/NVIDIA-Merlin/dataloader.git + git+https://github.com/NVIDIA-Merlin/NVTabular.git commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git python -m sphinx.cmd.build -E -P -b html docs/source docs/build/html [testenv:docs-multi] @@ -155,9 +158,9 @@ changedir = {toxinidir} deps = -rrequirements/docs.txt -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git + 
git+https://github.com/NVIDIA-Merlin/dataloader.git + git+https://github.com/NVIDIA-Merlin/NVTabular.git commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git sphinx-multiversion --dump-metadata docs/source docs/build/html | jq "keys" sphinx-multiversion docs/source docs/build/html From 2914452574874a9f07e9eb9c9f2233fd0108fba2 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:06:05 +0100 Subject: [PATCH 05/27] Use posargs for tests path in tox environments --- tox.ini | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tox.ini b/tox.ini index bd1fd711f6..3d624c88c5 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,7 @@ setenv = TF_GPU_ALLOCATOR=cuda_malloc_async sitepackages=true commands = - bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs tests/ || ([ $? = 5 ] && exit 0 || exit $?)' + bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)' [testenv:multi-gpu] @@ -73,7 +73,7 @@ deps = git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH} commands = sh examples/usecases/multi-gpu/install_sparse_operation_kit.sh {envdir} - bash -c 'horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh python -m pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit || ([ $? = 5 ] && exit 0 || exit $?)' + bash -c 'horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh python -m pytest -m "unit and horovod {env:EXTRA_PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? 
= 5 ] && exit 0 || exit $?)' [testenv:horovod-cpu] setenv = @@ -90,7 +90,7 @@ commands = conda env create --prefix {envdir}/env --file requirements/horovod-cpu-environment.yml --force {envdir}/env/bin/python -m pip install 'horovod==0.27.0' --no-cache-dir {envdir}/env/bin/horovodrun --check-build - {envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m "horovod {env:EXTRA_PYTEST_MARKERS}" -rxs tests/unit + {envdir}/env/bin/horovodrun -np 2 sh examples/usecases/multi-gpu/hvd_wrapper.sh pytest -m "unit and horovod {env:EXTRA_PYTEST_MARKERS}" -rxs {posargs:tests} [testenv:nvtabular-cpu] passenv=GIT_COMMIT From 2c7f3376c5bb83716db86c4af75d3b38a7c0f0be Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:20:40 +0100 Subject: [PATCH 06/27] Run single GPU tests in nvidia/tensorflow and nvidia/cuda images --- .github/workflows/gpu.yml | 62 +++++++++++++++++++++++++++++++++------ tox.ini | 2 ++ 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 90478661c9..28b5bc797c 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -16,25 +16,69 @@ concurrency: jobs: gpu-ci: - runs-on: 1GPU - + runs-on: linux-amd64-gpu-p100-latest-1 + container: + image: nvcr.io/nvidia/tensorflow:23.06-tf2-py3 + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: - uses: actions/checkout@v3 with: fetch-depth: 0 + - name: Install Ubuntu packages + run: | + apt-get update -y + apt-get install -y lsb-release + - name: Install and upgrade python packages + run: | + python -m pip install --upgrade pip tox + - name: Get Branch name + id: get-branch-name + uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request - name: Run tests run: | - ref_type=${{ github.ref_type }} - branch=main - if [[ $ref_type == "tag"* ]] - then - git -c protocol.version=2 fetch --no-tags --prune 
--progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release* - branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///') + if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then + extra_pytest_markers="and changed" fi + merlin_branch="${{ steps.get-branch-name.outputs.branch }}" + MERLIN_BRANCH=$merlin_branch \ + PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \ + tox -e gpu + + gpu-cu11: + runs-on: linux-amd64-gpu-p100-latest-1 + container: + image: nvidia/cuda:11.8.0-devel-ubuntu22.04 + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Install Ubuntu packages + run: | + apt-get update -y + # libcudnn8 installed for tensorflow GPU support + apt-get install -y git lsb-release 'libcudnn8=*cuda11.8' + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Install and upgrade python packages + run: | + python -m pip install --upgrade pip tox + - name: Get Branch name + id: get-branch-name + uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request + - name: Run tests + run: | if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu + merlin_branch="${{ steps.get-branch-name.outputs.branch }}" + RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch \ + PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \ + tox -e gpu-cu11 tests-examples: runs-on: 1GPU diff --git a/tox.ini b/tox.ini index 3d624c88c5..1bfd4b2e17 100644 --- a/tox.ini +++ b/tox.ini @@ -37,6 +37,7 @@ commands = ; Runs GPU-based tests. 
allowlist_externals = bash + cp deps = -rrequirements/test.txt git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} @@ -49,6 +50,7 @@ setenv = TF_GPU_ALLOCATOR=cuda_malloc_async sitepackages=true commands = + bash -c 'cp $(python -c "import sys; print(sys.base_prefix)")/lib/*.so* $(python -c "import sys; print(sys.prefix)")/lib' bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)' From 487f9740c36872aba05b878e82f64e3a76de9c0c Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:28:25 +0100 Subject: [PATCH 07/27] Trigger GPU PR tests from push instead of pull_request --- .github/workflows/gpu.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 28b5bc797c..22ad9a59f1 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -3,12 +3,11 @@ name: gpu-ci on: workflow_dispatch: push: - branches: [main] + branches: + - main + - pull-request/* tags: - "v[0-9]+.[0-9]+.[0-9]+" - pull_request: - branches: [main] - types: [opened, synchronize, reopened] concurrency: group: ${{ github.workflow }}-${{ github.ref }} From 26252c5f3e9b503d7438929f337ed9d55196a7e6 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:30:03 +0100 Subject: [PATCH 08/27] Add RAPIDS P100 runner to list of self-hosted runners config --- .github/actionlint.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index e0fa30b4bf..f5f4e4c59c 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -3,3 +3,4 @@ self-hosted-runner: labels: - 1GPU - 2GPU + - linux-amd64-gpu-p100-latest-1 From 268bf20a5106dbdb0f09780688189c822ab468c6 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy 
<1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:10:41 +0100 Subject: [PATCH 09/27] Add fixture to cleanup dataloader --- tests/conftest.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 6e5daeb58b..386ea8884f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,6 +20,7 @@ import platform import warnings from pathlib import Path +from unittest.mock import patch import distributed import psutil @@ -27,6 +28,7 @@ from asvdb import BenchmarkInfo, utils from merlin.core.utils import Distributed +from merlin.dataloader.loader_base import LoaderBase from merlin.datasets.synthetic import generate_data from merlin.io import Dataset from merlin.models.utils import ci_utils @@ -145,3 +147,17 @@ def get_benchmark_info(): arch=uname.machine, ram="%d" % psutil.virtual_memory().total, ) + + +@pytest.fixture(scope="function", autouse=True) +def cleanup_dataloader(): + """After each test runs, call .stop() on any dataloaders created during the test. + This avoids issues with background threads hanging around and interfering with subsequent tests. + This happens when a dataloader is partially consumed (not all batches are iterated through). 
+ """ + with patch.object( + LoaderBase, "__iter__", side_effect=LoaderBase.__iter__, autospec=True + ) as patched: + yield + for call in patched.call_args_list: + call.args[0].stop() From a1515d78ca2b08b3053d1067abd874e4c152d280 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:10:57 +0100 Subject: [PATCH 10/27] Replace import of collections.Sequence with collections.abc.Sequence --- merlin/models/tf/core/tabular.py | 4 ++-- merlin/models/tf/inputs/embedding.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/merlin/models/tf/core/tabular.py b/merlin/models/tf/core/tabular.py index 33b1ed5b42..3f2bb8e574 100644 --- a/merlin/models/tf/core/tabular.py +++ b/merlin/models/tf/core/tabular.py @@ -1,5 +1,5 @@ import abc -import collections +import collections.abc import copy from typing import Dict, List, Optional, Sequence, Union, overload @@ -600,7 +600,7 @@ def get_config(self): def select_by_tag(self, tags: Tags) -> Optional["Filter"]: if isinstance(self.feature_names, Tags): schema = self.schema.select_by_tag(self.feature_names).select_by_tag(tags) - elif isinstance(self.feature_names, collections.Sequence): + elif isinstance(self.feature_names, collections.abc.Sequence): schema = self.schema.select_by_name(self.feature_names).select_by_tag(tags) else: raise RuntimeError( diff --git a/merlin/models/tf/inputs/embedding.py b/merlin/models/tf/inputs/embedding.py index aff30184a6..156ef974e8 100644 --- a/merlin/models/tf/inputs/embedding.py +++ b/merlin/models/tf/inputs/embedding.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import collections +import collections.abc import inspect from copy import deepcopy from dataclasses import dataclass @@ -268,7 +268,7 @@ def select_by_tag(self, tags: Union[Tags, Sequence[Tags]]) -> Optional["Embeddin ------- An EmbeddingTable if the tags match. 
If no features match, it returns None. """ - if not isinstance(tags, collections.Sequence): + if not isinstance(tags, collections.abc.Sequence): tags = [tags] selected_schema = self.schema.select_by_tag(tags) From aaafe21667cd6cf80ccc13211a67553fe9f3da06 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:20:28 +0100 Subject: [PATCH 11/27] Remove COMPARE_BRANCH from gpu.yml --- .github/workflows/gpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 352fcbce5c..ff9f2818a6 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -41,7 +41,7 @@ jobs: fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" MERLIN_BRANCH=$merlin_branch \ - COMPARE_BRANCH=${{ github.base_ref }} \ + COMPARE_BRANCH=$merlin_branch \ PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \ tox -e gpu @@ -77,7 +77,7 @@ jobs: fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch \ - COMPARE_BRANCH=${{ github.base_ref }} \ + COMPARE_BRANCH=$merlin_branch \ PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \ tox -e gpu-cu11 From 169fb2c4edbd44ecf4f3ff0ddf4fd8b39802dcb1 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:25:41 +0100 Subject: [PATCH 12/27] Update ref for branch-name action in gpu.yml --- .github/workflows/gpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index ff9f2818a6..1d0ab6e565 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -33,7 +33,7 @@ jobs: python -m pip install --upgrade pip tox - name: Get Branch name id: get-branch-name - uses: 
NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request + uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 - name: Run tests run: | if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then @@ -69,7 +69,7 @@ jobs: python -m pip install --upgrade pip tox - name: Get Branch name id: get-branch-name - uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request + uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 - name: Run tests run: | if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then From 15ae84b9bd7ff27eead442ba6899a93e5cc12866 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:35:36 +0100 Subject: [PATCH 13/27] Run GPU examples in RAPIDS runner --- .github/workflows/gpu.yml | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 1d0ab6e565..6a4aaf0367 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -82,22 +82,36 @@ jobs: tox -e gpu-cu11 tests-examples: - runs-on: 1GPU - + runs-on: linux-amd64-gpu-p100-latest-1 + container: + image: nvidia/cuda:11.8.0-devel-ubuntu22.04 + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: - uses: actions/checkout@v3 with: fetch-depth: 0 + - name: Install Ubuntu packages + run: | + apt-get update -y + # libcudnn8 installed for tensorflow GPU support + apt-get install -y git lsb-release 'libcudnn8=*cuda11.8' + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Install and upgrade python packages + run: | + python -m pip install --upgrade pip tox + - name: Get Branch name + id: get-branch-name + uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 - name: Run tests run: | - ref_type=${{ github.ref_type }} - branch=main - if [[ 
$ref_type == "tag"* ]] - then - git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release* - branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///') - fi if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then extra_pytest_markers="and changed" fi - cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu + merlin_branch="${{ steps.get-branch-name.outputs.branch }}" + RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ + PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" \ + tox -e gpu-cu11 From 1c514f4ab54d3f7ba6a3b6dbd0dc9107bc1c6658 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:45:22 +0100 Subject: [PATCH 14/27] Move branch env vars to one line --- .github/workflows/gpu.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 6a4aaf0367..3518f357ed 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -40,8 +40,7 @@ jobs: extra_pytest_markers="and changed" fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - MERLIN_BRANCH=$merlin_branch \ - COMPARE_BRANCH=$merlin_branch \ + MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \ tox -e gpu @@ -76,8 +75,7 @@ jobs: extra_pytest_markers="and changed" fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch \ - COMPARE_BRANCH=$merlin_branch \ + RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ PYTEST_MARKERS="unit and not (examples or 
integration or notebook) $extra_pytest_markers" \ tox -e gpu-cu11 From 7aa74c043a07066f62bac62b12077e854a20f91b Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:55:30 +0100 Subject: [PATCH 15/27] Add pip cache --- .github/workflows/gpu.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 3518f357ed..85a13b9991 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -63,6 +63,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 + cache: 'pip' - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox @@ -98,6 +99,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 + cache: 'pip' - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox From fbb2e4068000f33f7c426c59194faebbb107c145 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 12:00:47 +0100 Subject: [PATCH 16/27] Replace single quotes in gpu.yml --- .github/workflows/gpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 85a13b9991..ab60b61cdd 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -63,7 +63,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - cache: 'pip' + cache: "pip" - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox @@ -99,7 +99,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - cache: 'pip' + cache: "pip" - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox From 90cff0ad76d13d49e3de9384414e6fa4f1bb1ca4 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 12:19:02 +0100 Subject: [PATCH 17/27] Use 
actions/cache for tox environment --- .github/workflows/gpu.yml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index ab60b61cdd..8280d7f82f 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -16,8 +16,11 @@ concurrency: jobs: gpu-ci: runs-on: linux-amd64-gpu-p100-latest-1 + strategy: + matrix: + image: ["nvcr.io/nvidia/tensorflow:23.06-tf2-py3"] container: - image: nvcr.io/nvidia/tensorflow:23.06-tf2-py3 + image: ${{ matrix.image }} env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: @@ -31,6 +34,10 @@ jobs: - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox + - uses: actions/cache@v3 + with: + path: .tox + key: tox-${{ matrix.image }}-${{ hashFiles('requirements/*.txt') }} - name: Get Branch name id: get-branch-name uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 @@ -63,7 +70,10 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - cache: "pip" + - uses: actions/cache@v3 + with: + path: .tox + key: tox-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox @@ -99,7 +109,10 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - cache: "pip" + - uses: actions/cache@v3 + with: + path: .tox + key: tox-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox From 55ff2b0312a1ed9c11b7273777c23ce34ce27045 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 12:27:40 +0100 Subject: [PATCH 18/27] Use id for setup-python step --- .github/workflows/gpu.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 8280d7f82f..9793f93625 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -67,13 +67,14 @@ jobs: # libcudnn8 installed for tensorflow GPU support apt-get install -y git lsb-release 'libcudnn8=*cuda11.8' - name: Set up Python 3.8 + id: setup-python uses: actions/setup-python@v4 with: python-version: 3.8 - uses: actions/cache@v3 with: path: .tox - key: tox-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} + key: tox-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox @@ -106,13 +107,14 @@ jobs: # libcudnn8 installed for tensorflow GPU support apt-get install -y git lsb-release 'libcudnn8=*cuda11.8' - name: Set up Python 3.8 + id: setup-python uses: actions/setup-python@v4 with: python-version: 3.8 - uses: actions/cache@v3 with: path: .tox - key: tox-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} + key: tox-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox From 63bc85a9ff4b4d77945343e13192254ca5b8ffd1 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 12:41:04 +0100 Subject: [PATCH 19/27] Replace double [[ with single [ --- .github/workflows/gpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 9793f93625..f2b7ba84e3 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -43,7 +43,7 @@ jobs: uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 - name: Run tests run: | - if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then + if [ "${{ github.ref }}" 
!= 'refs/heads/main' ]; then extra_pytest_markers="and changed" fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" @@ -123,7 +123,7 @@ jobs: uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 - name: Run tests run: | - if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then + if [ "${{ github.ref }}" != 'refs/heads/main' ]; then extra_pytest_markers="and changed" fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" From 2ec75dcf4797634f295f6be27e086c76f38bf34a Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 13:01:09 +0100 Subject: [PATCH 20/27] Move checkout after ubuntu package install --- .github/workflows/gpu.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index f2b7ba84e3..0a7c65435f 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -58,14 +58,14 @@ jobs: env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: Install Ubuntu packages run: | apt-get update -y # libcudnn8 installed for tensorflow GPU support apt-get install -y git lsb-release 'libcudnn8=*cuda11.8' + - uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Set up Python 3.8 id: setup-python uses: actions/setup-python@v4 @@ -98,14 +98,14 @@ jobs: env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: Install Ubuntu packages run: | apt-get update -y # libcudnn8 installed for tensorflow GPU support apt-get install -y git lsb-release 'libcudnn8=*cuda11.8' + - uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Set up Python 3.8 id: setup-python uses: actions/setup-python@v4 From 7d9de2c35b4f6815031ec38440fa549cf1801e7e Mon Sep 17 00:00:00 2001 From: Oliver Holworthy 
<1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 13:30:47 +0100 Subject: [PATCH 21/27] Use cuda runtime base image instead of devel --- .github/workflows/gpu.yml | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 0a7c65435f..8c6b2c515e 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -27,10 +27,6 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 - - name: Install Ubuntu packages - run: | - apt-get update -y - apt-get install -y lsb-release - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox @@ -54,15 +50,17 @@ jobs: gpu-cu11: runs-on: linux-amd64-gpu-p100-latest-1 container: - image: nvidia/cuda:11.8.0-devel-ubuntu22.04 + image: nvidia/cuda:11.8.0-runtime-ubuntu22.04 env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: - name: Install Ubuntu packages run: | apt-get update -y - # libcudnn8 installed for tensorflow GPU support - apt-get install -y git lsb-release 'libcudnn8=*cuda11.8' + apt-get install -y \ + git \ + 'libcudnn8=*cuda11.8' `# tensorflow GPU support` \ + cuda-nvcc-11-8 `# required for numba` - uses: actions/checkout@v3 with: fetch-depth: 0 @@ -83,7 +81,7 @@ jobs: uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 - name: Run tests run: | - if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then + if [ "${{ github.ref }}" != 'refs/heads/main' ]; then extra_pytest_markers="and changed" fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" @@ -94,7 +92,7 @@ jobs: tests-examples: runs-on: linux-amd64-gpu-p100-latest-1 container: - image: nvidia/cuda:11.8.0-devel-ubuntu22.04 + image: nvidia/cuda:11.8.0-runtime-ubuntu22.04 env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: @@ -102,7 +100,10 @@ jobs: run: | apt-get update -y # libcudnn8 installed for tensorflow GPU support - apt-get 
install -y git lsb-release 'libcudnn8=*cuda11.8' + apt-get install -y \ + git \ + 'libcudnn8=*cuda11.8' `# tensorflow GPU support` \ + cuda-nvcc-11-8 `# required for numba` - uses: actions/checkout@v3 with: fetch-depth: 0 From 8c04544b75480c2306e5ac2f1915edc2ec9f93bb Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 15:33:46 +0100 Subject: [PATCH 22/27] Move matrix configuration to map --- .github/workflows/gpu.yml | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 8c6b2c515e..f766705268 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -49,10 +49,15 @@ jobs: gpu-cu11: runs-on: linux-amd64-gpu-p100-latest-1 + env: + IMAGE: "nvidia/cuda:11.8.0-runtime-ubuntu22.04" container: - image: nvidia/cuda:11.8.0-runtime-ubuntu22.04 + image: ${{ env.IMAGE }} env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + strategy: + matrix: + versions: [ {rapids: "23.04", python: "3.8"} ] steps: - name: Install Ubuntu packages run: | @@ -64,15 +69,15 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 - - name: Set up Python 3.8 + - name: Set up Python ${{ matrix.version.python }} id: setup-python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: ${{ matrix.version.python }} - uses: actions/cache@v3 with: path: .tox - key: tox-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} + key: tox-${{ matrix.IMAGE }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox @@ -85,16 +90,21 @@ jobs: extra_pytest_markers="and changed" fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ + RAPIDS_VERSION=${{ 
matrix.version.rapids }} MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \ tox -e gpu-cu11 tests-examples: + env: + IMAGE: "nvidia/cuda:11.8.0-runtime-ubuntu22.04" runs-on: linux-amd64-gpu-p100-latest-1 container: - image: nvidia/cuda:11.8.0-runtime-ubuntu22.04 + image: ${{ env.IMAGE }} env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + strategy: + matrix: + versions: [ {rapids: "23.04", python: "3.8"} ] steps: - name: Install Ubuntu packages run: | @@ -107,15 +117,15 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 - - name: Set up Python 3.8 + - name: Set up Python ${{ matrix.version.python }} id: setup-python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: ${{ matrix.version.python }} - uses: actions/cache@v3 with: path: .tox - key: tox-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} + key: tox-${{ env.IMAGE }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox @@ -128,6 +138,6 @@ jobs: extra_pytest_markers="and changed" fi merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ + RAPIDS_VERSION=${{ matrix.version.rapids }} MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" \ tox -e gpu-cu11 From 1d428a0665356066a874985a617612af38210aa7 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 16:14:08 +0100 Subject: [PATCH 23/27] Update formatting of gpu.yml --- .github/workflows/gpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 
f766705268..a3e8371f02 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -57,7 +57,7 @@ jobs: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} strategy: matrix: - versions: [ {rapids: "23.04", python: "3.8"} ] + versions: [{ rapids: "23.04", python: "3.8" }] steps: - name: Install Ubuntu packages run: | @@ -104,7 +104,7 @@ jobs: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} strategy: matrix: - versions: [ {rapids: "23.04", python: "3.8"} ] + versions: [{ rapids: "23.04", python: "3.8" }] steps: - name: Install Ubuntu packages run: | From 8e96553bcf8a67bb7ca799306a2fd398c9210732 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 20:47:43 +0100 Subject: [PATCH 24/27] Test against different DLFW versions --- .github/workflows/gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index a3e8371f02..87bf98fd35 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -18,7 +18,7 @@ jobs: runs-on: linux-amd64-gpu-p100-latest-1 strategy: matrix: - image: ["nvcr.io/nvidia/tensorflow:23.06-tf2-py3"] + image: ["nvcr.io/nvidia/tensorflow:23.02-tf2-py3", "nvcr.io/nvidia/tensorflow:23.04-tf2-py3", "nvcr.io/nvidia/tensorflow:23.06-tf2-py3"] container: image: ${{ matrix.image }} env: From 1cd702f910065d6e958dfe42389606ea948b68ba Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 20:48:15 +0100 Subject: [PATCH 25/27] disable cu11 test --- .github/workflows/gpu.yml | 92 +++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 87bf98fd35..9fd8b5a4cc 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -47,52 +47,52 @@ jobs: PYTEST_MARKERS="unit and not (examples or integration or notebook) 
$extra_pytest_markers" \ tox -e gpu - gpu-cu11: - runs-on: linux-amd64-gpu-p100-latest-1 - env: - IMAGE: "nvidia/cuda:11.8.0-runtime-ubuntu22.04" - container: - image: ${{ env.IMAGE }} - env: - NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} - strategy: - matrix: - versions: [{ rapids: "23.04", python: "3.8" }] - steps: - - name: Install Ubuntu packages - run: | - apt-get update -y - apt-get install -y \ - git \ - 'libcudnn8=*cuda11.8' `# tensorflow GPU support` \ - cuda-nvcc-11-8 `# required for numba` - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Set up Python ${{ matrix.version.python }} - id: setup-python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.version.python }} - - uses: actions/cache@v3 - with: - path: .tox - key: tox-${{ matrix.IMAGE }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} - - name: Install and upgrade python packages - run: | - python -m pip install --upgrade pip tox - - name: Get Branch name - id: get-branch-name - uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 - - name: Run tests - run: | - if [ "${{ github.ref }}" != 'refs/heads/main' ]; then - extra_pytest_markers="and changed" - fi - merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - RAPIDS_VERSION=${{ matrix.version.rapids }} MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ - PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \ - tox -e gpu-cu11 + # gpu-cu11: + # runs-on: linux-amd64-gpu-p100-latest-1 + # env: + # IMAGE: "nvidia/cuda:11.8.0-runtime-ubuntu22.04" + # container: + # image: ${{ env.IMAGE }} + # env: + # NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + # strategy: + # matrix: + # versions: [{ rapids: "23.04", python: "3.8" }] + # steps: + # - name: Install Ubuntu packages + # run: | + # apt-get update -y + # apt-get install -y \ + # git \ + # 'libcudnn8=*cuda11.8' `# 
tensorflow GPU support` \ + # cuda-nvcc-11-8 `# required for numba` + # - uses: actions/checkout@v3 + # with: + # fetch-depth: 0 + # - name: Set up Python ${{ matrix.version.python }} + # id: setup-python + # uses: actions/setup-python@v4 + # with: + # python-version: ${{ matrix.version.python }} + # - uses: actions/cache@v3 + # with: + # path: .tox + # key: tox-${{ matrix.IMAGE }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} + # - name: Install and upgrade python packages + # run: | + # python -m pip install --upgrade pip tox + # - name: Get Branch name + # id: get-branch-name + # uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3 + # - name: Run tests + # run: | + # if [ "${{ github.ref }}" != 'refs/heads/main' ]; then + # extra_pytest_markers="and changed" + # fi + # merlin_branch="${{ steps.get-branch-name.outputs.branch }}" + # RAPIDS_VERSION=${{ matrix.version.rapids }} MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \ + # PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \ + # tox -e gpu-cu11 tests-examples: env: From 2927bb4236e310e2f981a5f59983a3e8227ea7b2 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 20:57:55 +0100 Subject: [PATCH 26/27] reformat gpu.yml --- .github/workflows/gpu.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 9fd8b5a4cc..85d097890a 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -18,7 +18,12 @@ jobs: runs-on: linux-amd64-gpu-p100-latest-1 strategy: matrix: - image: ["nvcr.io/nvidia/tensorflow:23.02-tf2-py3", "nvcr.io/nvidia/tensorflow:23.04-tf2-py3", "nvcr.io/nvidia/tensorflow:23.06-tf2-py3"] + image: + [ + "nvcr.io/nvidia/tensorflow:23.02-tf2-py3", + "nvcr.io/nvidia/tensorflow:23.04-tf2-py3", + 
"nvcr.io/nvidia/tensorflow:23.06-tf2-py3", + ] container: image: ${{ matrix.image }} env: @@ -104,7 +109,7 @@ jobs: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} strategy: matrix: - versions: [{ rapids: "23.04", python: "3.8" }] + version: [{ rapids: "23.04", python: "3.8" }] steps: - name: Install Ubuntu packages run: | From 8cd55375af28d668a422f179dd1e2d9f0820c243 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 3 Jul 2023 21:05:08 +0100 Subject: [PATCH 27/27] Remove image env --- .github/workflows/gpu.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 85d097890a..58ff8c862e 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -100,11 +100,9 @@ jobs: # tox -e gpu-cu11 tests-examples: - env: - IMAGE: "nvidia/cuda:11.8.0-runtime-ubuntu22.04" runs-on: linux-amd64-gpu-p100-latest-1 container: - image: ${{ env.IMAGE }} + image: "nvidia/cuda:11.8.0-runtime-ubuntu22.04" env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} strategy: @@ -130,7 +128,7 @@ jobs: - uses: actions/cache@v3 with: path: .tox - key: tox-${{ env.IMAGE }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} + key: tox-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }} - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox