Fix linking of external code from callees #137

Open — wants to merge 2 commits into base: main
54 changes: 44 additions & 10 deletions numba_cuda/numba/cuda/dispatcher.py
@@ -5,7 +5,7 @@
import ctypes
import functools

from numba.core import config, serialize, sigutils, types, typing, utils
from numba.core import config, ir, serialize, sigutils, types, typing, utils
from numba.core.caching import Cache, CacheImpl
from numba.core.compiler_lock import global_compiler_lock
from numba.core.dispatcher import Dispatcher
@@ -42,6 +42,47 @@
reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape']


def get_cres_link_objects(cres):
"""Given a compile result, return a set of all linkable code objects that
are required for it to be fully linked."""

link_objects = set()

# The typemap of the function includes calls, so we can traverse it to find
# the references we need.
for name, v in cres.fndesc.typemap.items():

# CUDADispatchers represent a call to a device function, so we need to
# look up the linkable code for those recursively.
if isinstance(v, cuda_types.CUDADispatcher):
# We need to locate the signature of the call so we can find the
# correct overload.
for call, sig in cres.fndesc.calltypes.items():
if isinstance(call, ir.Expr) and call.op == 'call':
# There will likely be multiple calls in the typemap; we
# can uniquely identify the relevant one using its SSA
# name.
if call.func.name == name:
called_cres = v.dispatcher.overloads[sig.args]
called_link_objects = get_cres_link_objects(called_cres)
link_objects.update(called_link_objects)
Comment on lines +51 to +68
@isVoid (Collaborator) commented on Mar 2, 2025:
This is cool. I learnt a few things reading through this section. Do you think the change below simplifies the code and reduces its complexity a little?

I made a PR here:
gmarkall#4

I think this reduces the size of the list for both of the nested for-loops, so the cost is proportional to O(num_calls^2) rather than O(num_typings^2).
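For illustration, a minimal sketch of what such a single-pass approach could look like (an assumption on my part — not necessarily the code in gmarkall#4): iterate over calltypes directly so the outer loop only visits call sites, and resolve each callee through its typemap entry.

```python
# Hypothetical sketch of the suggested simplification: walk calltypes
# (call sites only) instead of the full typemap, and resolve each callee
# via the typemap entry for its SSA name.
def get_cres_link_objects(cres):
    link_objects = set()

    for call, sig in cres.fndesc.calltypes.items():
        # Only direct call expressions are of interest here.
        if not (isinstance(call, ir.Expr) and call.op == 'call'):
            continue

        v = cres.fndesc.typemap[call.func.name]

        # Recurse into device functions called via a CUDADispatcher.
        if isinstance(v, cuda_types.CUDADispatcher):
            called_cres = v.dispatcher.overloads[sig.args]
            link_objects.update(get_cres_link_objects(called_cres))

        # Collect the linkable code attached to ExternFunction declarations.
        elif isinstance(v, Function) and isinstance(v.typing_key, ExternFunction):
            link_objects.update(v.typing_key.link)

    return link_objects
```

This keeps the recursion into callees and the ExternFunction handling in a single pass over call sites, avoiding the nested scan of the full typemap.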


# From this point onwards, we are only interested in ExternFunction
# declarations - these correspond to calls made directly from this
# function to external device functions.

if not isinstance(v, Function):
continue

if not isinstance(v.typing_key, ExternFunction):
continue

for obj in v.typing_key.link:
link_objects.add(obj)

return link_objects


class _Kernel(serialize.ReduceMixin):
'''
CUDA Kernel specialized for a given set of argument types. When called, this
@@ -159,15 +200,8 @@ def link_to_library_functions(library_functions, library_path,

self.maybe_link_nrt(link, tgt_ctx, asm)

for k, v in cres.fndesc.typemap.items():
if not isinstance(v, Function):
continue

if not isinstance(v.typing_key, ExternFunction):
continue

for obj in v.typing_key.link:
lib.add_linking_file(obj)
for obj in get_cres_link_objects(cres):
lib.add_linking_file(obj)

for filepath in link:
lib.add_linking_file(filepath)
125 changes: 125 additions & 0 deletions numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
@@ -205,6 +205,14 @@ def rgba_caller(x, channels):
}
""")

times3_cu = cuda.CUSource("""
extern "C" __device__
int times3(int *out, int a)
{
*out = a * 3;
return 0;
}
""")

times4_cu = cuda.CUSource("""
extern "C" __device__
@@ -351,6 +359,123 @@ def kernel(x, seed):
kernel[1, 1](x, 1)
np.testing.assert_equal(x[0], 323845807)

def test_declared_in_called_function(self):
times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)

@cuda.jit
def device_func(x):
return times2(x)

@cuda.jit
def kernel(r, x):
i = cuda.grid(1)
if i < len(r):
r[i] = device_func(x[i])

x = np.arange(10, dtype=np.int32)
r = np.empty_like(x)

kernel[1, 32](r, x)

np.testing.assert_equal(r, x * 2)

def test_declared_in_called_function_twice(self):
times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)

@cuda.jit
def device_func_1(x):
return times2(x)

@cuda.jit
def device_func_2(x):
return device_func_1(x)

@cuda.jit
def kernel(r, x):
i = cuda.grid(1)
if i < len(r):
r[i] = device_func_2(x[i])

x = np.arange(10, dtype=np.int32)
r = np.empty_like(x)

kernel[1, 32](r, x)

np.testing.assert_equal(r, x * 2)

def test_declared_in_called_function_two_calls(self):
times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)

@cuda.jit
def device_func(x):
return times2(x)

@cuda.jit
def kernel(r, x):
i = cuda.grid(1)
if i < len(r):
r[i] = device_func(x[i]) + device_func(x[i] + i)

x = np.arange(10, dtype=np.int32)
r = np.empty_like(x)

kernel[1, 32](r, x)

np.testing.assert_equal(r, x * 6)

def test_call_declared_function_twice(self):
times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)

@cuda.jit
def kernel(r, x):
i = cuda.grid(1)
if i < len(r):
r[i] = times2(x[i]) + times2(x[i] + i)

x = np.arange(10, dtype=np.int32)
r = np.empty_like(x)

kernel[1, 32](r, x)

np.testing.assert_equal(r, x * 6)

def test_declared_in_called_function_and_parent(self):
times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)

@cuda.jit
def device_func(x):
return times2(x)

@cuda.jit
def kernel(r, x):
i = cuda.grid(1)
if i < len(r):
r[i] = device_func(x[i]) + times2(x[i])

x = np.arange(10, dtype=np.int32)
r = np.empty_like(x)

kernel[1, 32](r, x)

np.testing.assert_equal(r, x * 4)

def test_call_two_different_declared_functions(self):
times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu)
times3 = cuda.declare_device('times3', 'int32(int32)', link=times3_cu)

@cuda.jit
def kernel(r, x):
i = cuda.grid(1)
if i < len(r):
r[i] = times2(x[i]) + times3(x[i])

x = np.arange(10, dtype=np.int32)
r = np.empty_like(x)

kernel[1, 32](r, x)

np.testing.assert_equal(r, x * 5)


if __name__ == '__main__':
unittest.main()