diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml
index 7a137c68fe..20fade2647 100644
--- a/.github/workflows/gpu.yml
+++ b/.github/workflows/gpu.yml
@@ -1,71 +1,146 @@
-name: GPU CI
+name: gpu-ci
 
 on:
   workflow_dispatch:
   push:
     branches:
       - main
-      - "pull-request/[0-9]+"
+      - pull-request/*
     tags:
       - "v[0-9]+.[0-9]+.[0-9]+"
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   gpu-ci:
     runs-on: linux-amd64-gpu-p100-latest-1
+    strategy:
+      matrix:
+        image:
+          [
+            "nvcr.io/nvidia/tensorflow:23.02-tf2-py3",
+            "nvcr.io/nvidia/tensorflow:23.04-tf2-py3",
+            "nvcr.io/nvidia/tensorflow:23.06-tf2-py3",
+          ]
     container:
-      image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
+      image: ${{ matrix.image }}
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
-      options: --shm-size=1G
-      credentials:
-        username: $oauthtoken
-        password: ${{ secrets.NGC_TOKEN }}
-
     steps:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
+      - name: Install and upgrade python packages
+        run: |
+          python -m pip install --upgrade pip tox
+      - uses: actions/cache@v3
+        with:
+          path: .tox
+          key: tox-${{ matrix.image }}-${{ hashFiles('requirements/*.txt') }}
+      - name: Get Branch name
+        id: get-branch-name
+        uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3
       - name: Run tests
         run: |
-          nvidia-smi
-          pip install tox
-          ref_type=${{ github.ref_type }}
-          branch=main
-          if [[ $ref_type == "tag"* ]]
-          then
-            git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
-            branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
-          fi
-          if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then
+          if [ "${{ github.ref }}" != 'refs/heads/main' ]; then
             extra_pytest_markers="and changed"
           fi
-          PYTEST_MARKERS="unit and not (examples or integration or notebook) and (singlegpu or not multigpu) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu
+          merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
+          MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \
+            PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \
+            tox -e gpu
+
+  # gpu-cu11:
+  #   runs-on: linux-amd64-gpu-p100-latest-1
+  #   env:
+  #     IMAGE: "nvidia/cuda:11.8.0-runtime-ubuntu22.04"
+  #   container:
+  #     image: "nvidia/cuda:11.8.0-runtime-ubuntu22.04"
+  #     env:
+  #       NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+  #   strategy:
+  #     matrix:
+  #       version: [{ rapids: "23.04", python: "3.8" }]
+  #   steps:
+  #     - name: Install Ubuntu packages
+  #       run: |
+  #         apt-get update -y
+  #         apt-get install -y \
+  #           git \
+  #           'libcudnn8=*cuda11.8' `# tensorflow GPU support` \
+  #           cuda-nvcc-11-8 `# required for numba`
+  #     - uses: actions/checkout@v3
+  #       with:
+  #         fetch-depth: 0
+  #     - name: Set up Python ${{ matrix.version.python }}
+  #       id: setup-python
+  #       uses: actions/setup-python@v4
+  #       with:
+  #         python-version: ${{ matrix.version.python }}
+  #     - uses: actions/cache@v3
+  #       with:
+  #         path: .tox
+  #         key: tox-${{ env.IMAGE }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }}
+  #     - name: Install and upgrade python packages
+  #       run: |
+  #         python -m pip install --upgrade pip tox
+  #     - name: Get Branch name
+  #       id: get-branch-name
+  #       uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3
+  #     - name: Run tests
+  #       run: |
+  #         if [ "${{ github.ref }}" != 'refs/heads/main' ]; then
+  #           extra_pytest_markers="and changed"
+  #         fi
+  #         merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
+  #         RAPIDS_VERSION=${{ matrix.version.rapids }} MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \
+  #           PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \
+  #           tox -e gpu-cu11
 
-  gpu-ci-examples:
+  tests-examples:
     runs-on: linux-amd64-gpu-p100-latest-1
     container:
-      image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
+      image: "nvidia/cuda:11.8.0-runtime-ubuntu22.04"
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
-      options: --shm-size=1G
-      credentials:
-        username: $oauthtoken
-        password: ${{ secrets.NGC_TOKEN }}
+    strategy:
+      matrix:
+        version: [{ rapids: "23.04", python: "3.8" }]
     steps:
+      - name: Install Ubuntu packages
+        run: |
+          apt-get update -y
+          # libcudnn8 installed for tensorflow GPU support
+          apt-get install -y \
+            git \
+            'libcudnn8=*cuda11.8' `# tensorflow GPU support` \
+            cuda-nvcc-11-8 `# required for numba`
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
+      - name: Set up Python ${{ matrix.version.python }}
+        id: setup-python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.version.python }}
+      - uses: actions/cache@v3
+        with:
+          path: .tox
+          key: tox-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }}
+      - name: Install and upgrade python packages
+        run: |
+          python -m pip install --upgrade pip tox
+      - name: Get Branch name
+        id: get-branch-name
+        uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3
       - name: Run tests
         run: |
-          pip install tox
-          ref_type=${{ github.ref_type }}
-          branch=main
-          if [[ $ref_type == "tag"* ]]
-          then
-            git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
-            branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
-          fi
-          if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then
+          if [ "${{ github.ref }}" != 'refs/heads/main' ]; then
             extra_pytest_markers="and changed"
           fi
-          PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu
+          merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
+          RAPIDS_VERSION=${{ matrix.version.rapids }} MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \
+            PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" \
+            tox -e gpu-cu11
diff --git a/requirements/test.txt b/requirements/test.txt
index 91c50ffa24..b013591b21 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,5 +1,6 @@
 -r dev.txt
 -r pytorch.txt
 -r tensorflow.txt
+-r transformers.txt
 
 numpy<1.24
diff --git a/tox.ini b/tox.ini
index e35a7dcc78..2e4a07fd40 100644
--- a/tox.ini
+++ b/tox.ini
@@ -2,18 +2,42 @@
 ; .github/workflows/cpu-ci.yml for the workflow definition.
 
 [tox]
-envlist = gpu,multi-gpu,horovod-cpu,nvtabular-cpu,systems-cpu,transformers4rec-cpu,docs,docs-multi
+envlist = gpu,gpu-cu11,multi-gpu,horovod-cpu,nvtabular-cpu,systems-cpu,transformers4rec-cpu,docs,docs-multi
 
 [testenv]
 commands =
     pip install --upgrade pip
     pip install -e .[all]
 
+[testenv:gpu-cu11]
+; Runs in: GitHub Actions
+; Runs GPU-based tests with CUDA 11 pip wheels (cudf-cu11, dask-cudf-cu11, nvidia-cudnn-cu11) from pypi.nvidia.com.
+setenv =
+    TF_GPU_ALLOCATOR=cuda_malloc_async
+    PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com
+    PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+allowlist_externals =
+    bash
+passenv =
+    CUDA_VISIBLE_DEVICES
+deps =
+    -rrequirements/test.txt
+    git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH}
+    git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH}
+    git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH}
+    git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH}
+    nvidia-cudnn-cu11~=8.6.0
+    cudf-cu11=={env:RAPIDS_VERSION}
+    dask-cudf-cu11=={env:RAPIDS_VERSION}
+commands =
+    bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)'
+
 [testenv:gpu]
 ; Runs in: Github Actions
 ; Runs GPU-based tests.
 allowlist_externals =
     bash
+    cp
 deps =
     -rrequirements/test.txt
     git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH}
@@ -26,6 +50,8 @@ setenv =
     TF_GPU_ALLOCATOR=cuda_malloc_async
 sitepackages=true
 commands =
+    ; copy system libs into virtualenv path (e.g. XGBoost)
+    bash -c 'cp $(python -c "import sys; print(sys.base_prefix)")/lib/*.so* $(python -c "import sys; print(sys.prefix)")/lib'
     bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)'
 
 [testenv:horovod-gpu]