Merge remote-tracking branch 'NVIDIA/main' into arr_reshape

NVIDIA · Nov 29, 2024 · 0a0eb35 · 0a0eb35
2 parents 7928f2d + c04efe0
commit 0a0eb35
Show file tree

Hide file tree

Showing 37 changed files with 1,614 additions and 78 deletions.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -19,8 +19,10 @@ jobs:
       - compute-matrix
       - build-conda
       - test-conda
+      - test-conda-pynvjitlink
       - build-wheels
       - test-wheels
+      - test-wheels-pynvjitlink
       - build-docs
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
@@ -57,6 +59,16 @@ jobs:
       script: "ci/test_conda.sh"
       run_codecov: false
       matrix_filter: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
+  test-conda-pynvjitlink:
+    needs:
+      - build-conda
+      - compute-matrix
+    uses: ./.github/workflows/conda-python-tests.yaml
+    with:
+      build_type: pull-request
+      script: "ci/test_conda_pynvjitlink.sh"
+      run_codecov: false
+      matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.11"))
   build-wheels:
     needs:
       - compute-matrix
@@ -71,7 +83,15 @@ jobs:
     uses: ./.github/workflows/wheels-test.yaml
     with:
       build_type: pull-request
-      script: "ci/test_wheel.sh"
+      script: "ci/test_wheel.sh false"
+  test-wheels-pynvjitlink:
+    needs:
+      - build-wheels
+    uses: ./.github/workflows/wheels-test.yaml
+    with:
+      build_type: pull-request
+      script: "ci/test_wheel_pynvjitlink.sh"
+      matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "12.5.1" and .PY_VER == "3.12"))
   build-docs:
     needs:
       - build-conda

diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,5 @@ __pycache__
 build
 .*.swp
 *.so
+numba_cuda/numba/cuda/tests/cudadrv/test_device_functions.*
+numba_cuda/numba/cuda/tests/cudadrv/undefined_extern.*
diff --git a/ci/test_conda.sh b/ci/test_conda.sh
@@ -8,7 +8,7 @@ set -euo pipefail
 if [ "${CUDA_VER%.*.*}" = "11" ]; then
   CTK_PACKAGES="cudatoolkit"
 else
-  CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc"
+  CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc"
 fi
 
 rapids-logger "Install testing dependencies"

diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION
+
+set -euo pipefail
+
+. /opt/conda/etc/profile.d/conda.sh
+
+if [ "${CUDA_VER%.*.*}" = "11" ]; then
+  CTK_PACKAGES="cudatoolkit"
+else
+  CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc"
+fi
+
+rapids-logger "Install testing dependencies"
+# TODO: Replace with rapids-dependency-file-generator
+rapids-mamba-retry create -n test \
+    c-compiler \
+    cxx-compiler \
+    ${CTK_PACKAGES} \
+    cuda-python \
+    cuda-version=${CUDA_VER%.*} \
+    make \
+    psutil \
+    pytest \
+    python=${RAPIDS_PY_VERSION}
+
+# Temporarily allow unbound variables for conda activation.
+set +u
+conda activate test
+set -u
+
+rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda
+
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/
+mkdir -p "${RAPIDS_TESTS_DIR}"
+pushd "${RAPIDS_TESTS_DIR}"
+
+rapids-print-env
+
+rapids-logger "Check GPU usage"
+nvidia-smi
+
+rapids-logger "Show Numba system info"
+python -m numba --sysinfo
+
+EXITCODE=0
+trap "EXITCODE=1" ERR
+set +e
+
+
+rapids-logger "Install pynvjitlink"
+set +u
+rapids-mamba-retry install -c rapidsai pynvjitlink
+set -u
+
+rapids-logger "Build tests"
+
+PY_SCRIPT="
+import numba_cuda
+root = numba_cuda.__file__.removesuffix('__init__.py')
+test_dir = root + \"numba/cuda/tests/test_binary_generation/\"
+print(test_dir)
+"
+
+NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$PY_SCRIPT")
+pushd $NUMBA_CUDA_TEST_BIN_DIR
+make
+popd
+
+
+rapids-logger "Run Tests"
+NUMBA_CUDA_ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v
+
+popd
+
+rapids-logger "Test script exiting with value: $EXITCODE"
+exit ${EXITCODE}
diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
@@ -8,6 +8,7 @@ rapids-logger "Install testing dependencies"
 python -m pip install \
     psutil \
     cuda-python \
+    nvidia-cuda-cccl-cu12 \
     pytest
 
 rapids-logger "Install wheel"

diff --git a/ci/test_wheel_pynvjitlink.sh b/ci/test_wheel_pynvjitlink.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION
+
+set -euo pipefail
+
+rapids-logger "Install testing dependencies"
+# TODO: Replace with rapids-dependency-file-generator
+python -m pip install \
+    psutil \
+    cuda-python \
+    pytest
+
+rapids-logger "Install pynvjitlink"
+python -m pip install pynvjitlink-cu12
+
+rapids-logger "Build tests"
+PY_SCRIPT="
+import numba_cuda
+root = numba_cuda.__file__.removesuffix('__init__.py')
+test_dir = root + \"numba/cuda/tests/test_binary_generation/\"
+print(test_dir)
+"
+
+NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$PY_SCRIPT")
+pushd $NUMBA_CUDA_TEST_BIN_DIR
+make
+popd
+
+rapids-logger "Install wheel"
+package=$(realpath wheel/numba_cuda*.whl)
+echo "Package path: $package"
+python -m pip install $package
+
+rapids-logger "Check GPU usage"
+nvidia-smi
+
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/
+mkdir -p "${RAPIDS_TESTS_DIR}"
+pushd "${RAPIDS_TESTS_DIR}"
+
+rapids-logger "Show Numba system info"
+python -m numba --sysinfo
+
+rapids-logger "Run Tests"
+NUMBA_CUDA_ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v
+
+popd
diff --git a/docs/source/reference/envvars.rst b/docs/source/reference/envvars.rst
@@ -120,4 +120,14 @@ target.
    ``/usr/local/cuda/include``. On Windows, the default is
    ``$env:CUDA_PATH\include``.
 
+.. envvar:: NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY
 
+   Enable minor version compatibility for the CUDA driver. Requires the
+   ``cubinlinker`` and ``ptxcompiler`` packages to be installed. Provides minor
+   version compatibility for driver versions less than 12.0.
+
+.. envvar:: NUMBA_CUDA_ENABLE_PYNVJITLINK
+
+   Use ``pynvjitlink`` for minor version compatibility. Requires the ``pynvjitlink``
+   package to be installed. Provides minor version compatibility for driver versions
+   greater than 12.0.
diff --git a/docs/source/user/minor_version_compatibility.rst b/docs/source/user/minor_version_compatibility.rst
@@ -65,7 +65,8 @@ MVC support is enabled by setting the environment variable:
 
 .. code:: bash
 
-   export NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY=1
+   export NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY=1 # CUDA 11
+   export NUMBA_CUDA_ENABLE_PYNVJITLINK=1 # CUDA 12
 
 
 or by setting a configuration variable prior to using any CUDA functionality in
@@ -74,7 +75,8 @@ Numba:
 .. code:: python
 
    from numba import config
-   config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = True
+   config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = True # CUDA 11
+   config.CUDA_ENABLE_PYNVJITLINK = True # CUDA 12
 
 
 References

diff --git a/numba_cuda/VERSION b/numba_cuda/VERSION
@@ -1 +1 @@
-0.0.17
+0.0.18
diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py
@@ -59,8 +59,15 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
     get_cufunc), which may be of different compute capabilities.
     """
 
-    def __init__(self, codegen, name, entry_name=None, max_registers=None,
-                 nvvm_options=None):
+    def __init__(
+        self,
+        codegen,
+        name,
+        entry_name=None,
+        max_registers=None,
+        lto=False,
+        nvvm_options=None
+    ):
         """
         codegen:
             Codegen object.
@@ -71,6 +78,8 @@ def __init__(self, codegen, name, entry_name=None, max_registers=None,
             kernel and not a device function.
         max_registers:
             The maximum register usage to aim for when linking.
+        lto:
+            Whether to enable link-time optimization.
         nvvm_options:
                 Dict of options to pass to NVVM.
         """
@@ -103,6 +112,7 @@ def __init__(self, codegen, name, entry_name=None, max_registers=None,
         self._cufunc_cache = {}
 
         self._max_registers = max_registers
+        self._lto = lto
         if nvvm_options is None:
             nvvm_options = {}
         self._nvvm_options = nvvm_options
@@ -178,7 +188,9 @@ def get_cubin(self, cc=None):
         if cubin:
             return cubin
 
-        linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)
+        linker = driver.Linker.new(
+            max_registers=self._max_registers, cc=cc, lto=self._lto
+        )
 
         if linker.lto:
             ltoir = self.get_ltoir(cc=cc)

diff --git a/numba_cuda/numba/cuda/cuda_paths.py b/numba_cuda/numba/cuda/cuda_paths.py
@@ -2,9 +2,11 @@
 import re
 import os
 from collections import namedtuple
+import platform
 
 from numba.core.config import IS_WIN32
 from numba.misc.findlib import find_lib, find_file
+from numba import config
 
 
 _env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
@@ -241,6 +243,7 @@ def get_cuda_paths():
             'libdevice': _get_libdevice_paths(),
             'cudalib_dir': _get_cudalib_dir(),
             'static_cudalib_dir': _get_static_cudalib_dir(),
+            'include_dir': _get_include_dir(),
         }
         # Cache result
         get_cuda_paths._cached_result = d
@@ -256,3 +259,68 @@ def get_debian_pkg_libdevice():
     if not os.path.exists(pkg_libdevice_location):
         return None
     return pkg_libdevice_location
+
+
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+
+    CTK's conda package delivers headers based on its architecture type. For example,
+    an `x86_64` machine places headers under `$CONDA_PREFIX/targets/x86_64-linux`,
+    and an `aarch64` machine places them under `$CONDA_PREFIX/targets/sbsa-linux`.
+    Read more about the nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+
+    if system == "Linux":
+        arch_to_targets = {
+            'x86_64': 'x86_64-linux',
+            'aarch64': 'sbsa-linux'
+        }
+    elif system == "Windows":
+        arch_to_targets = {
+            'AMD64': 'x64',
+        }
+    else:
+        arch_to_targets = {}
+
+    return arch_to_targets.get(machine, None)
+
+
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+    if not is_conda_env:
+        return
+
+    if platform.system() == "Windows":
+        include_dir = os.path.join(
+            sys.prefix, 'Library', 'include'
+        )
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(
+            sys.prefix, 'targets', target_name, 'include'
+        )
+    else:
+        # Fallback for when the target cannot be determined,
+        # though that should not normally happen.
+        include_dir = os.path.join(sys.prefix, 'include')
+
+    if os.path.exists(include_dir):
+        return include_dir
+    return
+
+
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ('Conda environment (NVIDIA package)', get_conda_include_dir()),
+        ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)
diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -876,7 +876,10 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False):
             sentry_contiguous(obj)
             devobj = from_array_like(obj, stream=stream)
         if copy:
-            if config.CUDA_WARN_ON_IMPLICIT_COPY:
+            if (
+                config.CUDA_WARN_ON_IMPLICIT_COPY
+                and not config.DISABLE_PERFORMANCE_WARNINGS
+            ):
                 if (
                     not user_explicit and
                     (not isinstance(obj, DeviceNDArray)
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,3 +3,5 @@ __pycache__ @@
     build
     .*.swp
     *.so
+    numba_cuda/numba/cuda/tests/cudadrv/test_device_functions.*
+    numba_cuda/numba/cuda/tests/cudadrv/undefined_extern.*