NVIDIA · brandon-b-miller · Feb 21, 2025 · Feb 24, 2025 · Feb 24, 2025 · Feb 24, 2025
diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py
@@ -246,10 +246,11 @@ def get_cufunc(self):
             return cufunc
 
         cubin = self.get_cubin(cc=device.compute_capability)
-        module = ctx.create_module_image(cubin)
+        #module = ctx.create_module_image(cubin)
 
         # Load
-        cufunc = module.get_function(self._entry_name)
+        #cufunc = module.get_function(self._entry_name)
+        cufunc = cubin.get_kernel(self._entry_name)
 
         # Populate caches
         self._cufunc_cache[device.id] = cufunc

diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -32,6 +32,14 @@
 import numpy as np
 from collections import namedtuple, deque
 
+from cuda.core.experimental import (
+    Linker as _CUDALinker,
+    LinkerOptions as _CUDALinkerOptions,
+    ObjectCode,
+    Program,
+    ProgramOptions
+)
+
 from numba import mviewbuf
 from numba.core import utils, serialize, config
 from .error import CudaSupportError, CudaDriverError
@@ -2597,10 +2605,11 @@ def new(cls,
             linker = PyNvJitLinker
 
         elif config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY:
+            # TODO - who handles MVC now?
             linker = MVCLinker
         else:
             if USE_NV_BINDING:
-                linker = CudaPythonLinker
+                linker = CUDALinker
             else:
                 linker = CtypesLinker
 
@@ -2637,7 +2646,6 @@ def add_cu(self, cu, name):
         with driver.get_active_context() as ac:
             dev = driver.get_device(ac.devnum)
             cc = dev.compute_capability
-
         ptx, log = nvrtc.compile(cu, name, cc)
 
         if config.DUMP_ASSEMBLY:
@@ -2749,6 +2757,162 @@ def complete(self):
         """
 
 
+class CUDALinker(Linker):
+    """
+    Linker backed by the ``cuda.core.experimental`` Program / Linker API.
+
+    PTX added with :meth:`add_ptx` is compiled to a cubin immediately and
+    becomes the result of :meth:`complete`; CUDA C++ sources and cubins are
+    collected as object codes and linked together by :meth:`complete`.
+    """
+
+    def __init__(self, max_registers=None, lineinfo=False, cc=None):
+        """
+        :param max_registers: Forwarded to the linker and program options
+                              as ``max_register_count``.
+        :param lineinfo: Forwarded as the ``lineinfo`` option.
+        :param cc: Compute capability as a ``(major, minor)`` pair.
+        :raises TypeError: If ``cc`` is ``None``.
+        """
+        if cc is None:
+            # Indexing None below would raise an opaque TypeError; keep the
+            # exception type but say what is actually wrong.
+            raise TypeError(
+                "CUDALinker requires a compute capability (cc), got None"
+            )
+        arch = f"sm_{cc[0] * 10 + cc[1]}"
+        self.options = _CUDALinkerOptions(
+            max_register_count=max_registers,
+            lineinfo=lineinfo,
+            arch=arch
+        )
+
+        self.max_registers = max_registers
+        self.lineinfo = lineinfo
+        self.cc = cc
+        self.arch = arch
+        self.lto = False
+
+        self._complete = False
+        self._object_codes = []
+        # Set by add_ptx(); initialised here because complete() reads it
+        # even when no PTX was ever added.
+        self._linked = None
+        self.linker = None # need at least one program
+
+    @property
+    def info_log(self):
+        """Info log of the underlying linker, which only add_ptx() sets."""
+        if not self.linker:
+            raise ValueError("Not Initialized")
+        return self.linker.get_info_log()
+
+    @property
+    def error_log(self):
+        """Error log of the underlying linker, which only add_ptx() sets."""
+        if not self.linker:
+            raise ValueError("Not Initialized")
+        return self.linker.get_error_log()
+
+    def add_ptx(self, ptx, name='<cudapy-ptx>'):
+        """
+        Compile PTX to a cubin and record it as the linked result.
+
+        :param ptx: PTX source as UTF-8 encoded bytes.
+        :param name: Unused; accepted for interface compatibility.
+        """
+        prog = Program(
+            ptx.decode('utf-8'),
+            'ptx',
+            ProgramOptions(
+                arch=self.arch,
+                lineinfo=self.lineinfo,
+                max_register_count=self.max_registers
+            )
+        )
+
+        # calls Linker.link() internally?
+        obj = prog.compile('cubin')
+        self._complete = True
+        self._linked = obj
+        # NOTE(review): relies on the private Program._linker attribute so
+        # that info_log / error_log have something to query - confirm.
+        self.linker = prog._linker
+
+    def add_cu(self, cu, name='<cudapy-cu>'):
+        """
+        Compile CUDA C++ source to PTX and queue it for linking.
+
+        :param cu: CUDA C++ source as UTF-8 encoded bytes.
+        :param name: Unused; accepted for interface compatibility.
+        """
+        prog = Program(
+            cu.decode('utf-8'),
+            'c++',
+            ProgramOptions(
+                arch=self.arch,
+                lineinfo=self.lineinfo,
+                max_register_count=self.max_registers
+            )
+        )
+        try:
+            self._object_codes.append(prog.compile('ptx'))
+        finally:
+            # Release the program even if compilation fails.
+            prog.close()
+
+    def add_cubin(self, cubin, name='<cudapy-cubin>'):
+        """Wrap a cubin in an ``ObjectCode`` and queue it for linking."""
+        obj = ObjectCode.from_cubin(cubin)
+        self._object_codes.append(obj)
+
+    def add_file(self, path, kind):
+        """
+        Read ``path`` and add its contents according to ``kind``.
+
+        :raises LinkerError: If the file is missing or ``kind`` is not
+                             handled by this linker.
+        """
+        try:
+            with open(path, 'rb') as f:
+                data = f.read()
+        except FileNotFoundError:
+            raise LinkerError(f'{path} not found')
+
+        name = pathlib.Path(path).name
+        if kind == FILE_EXTENSION_MAP['ptx']:
+            fn = self.add_ptx
+        elif kind == FILE_EXTENSION_MAP['cubin']:
+            fn = self.add_cubin
+        elif kind == 'cu':
+            fn = self.add_cu
+        else:
+            raise LinkerError(f"Don't know how to link {kind}")
+
+        fn(data, name)
+
+    def complete(self):
+        """
+        Return the linked cubin.
+
+        If PTX was added, the cubin produced by :meth:`add_ptx` is returned;
+        otherwise all queued object codes are linked now.
+        """
+        # NOTE(review): object codes queued by add_cu() / add_cubin() are
+        # not linked in when PTX was added - confirm this is intended.
+        if self._linked is not None:
+            return self._linked
+        linker = _CUDALinker(
+            *self._object_codes,
+            options=self.options
+        )
+        try:
+            return linker.link('cubin')
+        finally:
+            # Always release the linker handle, even if linking fails.
+            linker.close()
+
+
 class MVCLinker(Linker):
     """
     Linker supporting Minor Version Compatibility, backed by the cubinlinker

diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
@@ -93,7 +93,6 @@ def __init__(self, py_func, argtypes, link=None, debug=False,
         self.debug = debug
         self.lineinfo = lineinfo
         self.extensions = extensions or []
-
         nvvm_options = {
             'fastmath': fastmath,
             'opt': 3 if opt else 0
@@ -406,7 +405,7 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
         stream_handle = stream and stream.handle or zero_stream
 
         # Invoke kernel
-        driver.launch_kernel(cufunc.handle,
+        driver.launch_kernel(cufunc._handle,
                              *griddim,
                              *blockdim,
                              sharedmem,