do not force reinit memsys on every kernel launch, add another test
isVoid committed Dec 4, 2024
1 parent 8805e99 commit a7d2887
Showing 3 changed files with 42 additions and 15 deletions.
5 changes: 2 additions & 3 deletions numba_cuda/numba/cuda/dispatcher.py
@@ -362,11 +362,10 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):

         stream_handle = stream and stream.handle or zero_stream

-        rtsys.allocate(stream_handle)
+        rtsys.ensure_allocate(stream_handle)
         rtsys.set_memsys_to_module(cufunc.module, stream_handle)
-        rtsys.initialize(stream_handle)
+        rtsys.ensure_initialize(stream_handle)
         rtsys.enable(stream_handle)
-        rtsys.print_memsys(0)

         # Invoke kernel
         driver.launch_kernel(cufunc.handle,
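For readers skimming the diff: the launch path previously called `rtsys.allocate` and `rtsys.initialize` unconditionally, re-running device-side memsys setup on every kernel launch. The new `ensure_*` entry points return early once setup has happened, so only the first launch pays the cost. A minimal sketch of that pattern (illustrative only, not the numba-cuda implementation; `_alloc_device_buffer` and `_run_init_kernel` are hypothetical stand-ins for the real device work):

    # Illustrative sketch only -- not the numba-cuda implementation.
    class MemsysSketch:
        def __init__(self):
            self._memsys = None
            self._initialized = False

        def ensure_allocate(self, stream):
            # First launch allocates; every later launch returns early.
            if self._memsys is not None:
                return
            self._memsys = self._alloc_device_buffer(stream)

        def ensure_initialize(self, stream):
            # Device-side NRT_MemSys state is initialized exactly once.
            if self._initialized:
                return
            self._run_init_kernel(stream)
            self._initialized = True

        def _alloc_device_buffer(self, stream):
            return bytearray(40)  # hypothetical stand-in for a device buffer

        def _run_init_kernel(self, stream):
            pass  # hypothetical stand-in for the init kernel launch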
21 changes: 10 additions & 11 deletions numba_cuda/numba/cuda/runtime/nrt.py
@@ -38,7 +38,7 @@ def _compile_memsys_module(self):

         self._memsys_module = module

-    def _ensure_allocate(self, stream):
+    def ensure_allocate(self, stream):
         if self._memsys is not None:
             return

@@ -50,13 +50,12 @@ def allocate(self, stream):
         if self._memsys_module is None:
             self._compile_memsys_module()

-        if self._memsys is None:
-            # Allocate space for NRT_MemSys
-            # TODO: determine the size of NRT_MemSys at runtime
-            self._memsys = device_array((40,), dtype="i1", stream=stream)
-            # TODO: Memsys module needs a stream that's consistent with the
-            # system's stream.
-            self.set_memsys_to_module(self._memsys_module, stream=stream)
+        # Allocate space for NRT_MemSys
+        # TODO: determine the size of NRT_MemSys at runtime
+        self._memsys = device_array((40,), dtype="i1", stream=stream)
+        # TODO: Memsys module needs a stream that's consistent with the
+        # system's stream.
+        self.set_memsys_to_module(self._memsys_module, stream=stream)

     def _single_thread_launch(self, module, stream, name, params=()):
         func = module.get_function(name)

@@ -70,7 +69,7 @@ def _single_thread_launch(self, module, stream, name, params=()):
             cooperative=False
         )

-    def _ensure_initialize(self, stream):
+    def ensure_initialize(self, stream):
         if self._initialized:
             return

@@ -94,8 +93,8 @@ def disable(self, stream):
             self._memsys_module, stream, "NRT_MemSys_disable")

     def _copy_memsys_to_host(self, stream=0):
-        self._ensure_allocate(stream)
-        self._ensure_initialize(stream)
+        self.ensure_allocate(stream)
+        self.ensure_initialize(stream)

         # Q: What stream should we execute this on?
         dt = np.dtype([
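The `_copy_memsys_to_host` change shows the intended caller-side idiom: any code that needs a live memsys can call the `ensure_*` methods unconditionally, since repeated calls are no-ops. A hedged usage sketch of that idiom (`read_memsys_stats` is a hypothetical caller, not part of this diff; it assumes the `get_allocation_stats` accessor the tests below use):

    # Hypothetical caller following the idiom in _copy_memsys_to_host:
    # ensure_* calls are safe to repeat, so no caller-side guards are needed.
    def read_memsys_stats(rtsys, stream=0):
        rtsys.ensure_allocate(stream)    # no-op after the first call
        rtsys.ensure_initialize(stream)  # no-op once initialized
        return rtsys.get_allocation_stats()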
31 changes: 30 additions & 1 deletion numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -32,15 +32,44 @@ def kernel():
             return None

         init_stats = rtsys.get_allocation_stats()
+        print("init_stats", init_stats)

         with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
             kernel[1,1]()
+        print("After kernel launch...")
+        rtsys.print_memsys(0)
         cur_stats = rtsys.get_allocation_stats()
+        print("cur_stats", cur_stats)
         self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
         self.assertEqual(cur_stats.free - init_stats.free, n)

+    def test_escaping_var_init_in_loop(self):
+        """
+        Test issue #1297
+        """
+
+        @cuda.jit
+        def g(n):
+
+            x = cuda_empty((n, 2), np.float64)
+
+            for i in range(n):
+                y = x[i]
+
+            for i in range(n):
+                y = x[i]  # noqa: F841
+
+            return None
+
+        init_stats = rtsys.get_allocation_stats()
+        print("init_stats", init_stats)
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1, 1](10)
+        print("After kernel launch...")
+        cur_stats = rtsys.get_allocation_stats()
+        print("cur_stats", cur_stats)
+        self.assertEqual(cur_stats.alloc - init_stats.alloc, 1)
+        self.assertEqual(cur_stats.free - init_stats.free, 1)


 class TestNrtBasic(CUDATestCase):
     def test_nrt_launches(self):
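Both tests follow the same stats-delta pattern: snapshot `rtsys.get_allocation_stats()`, launch under `CUDA_ENABLE_NRT`, snapshot again, and assert on the difference of the `.alloc` and `.free` counters. The new test expects exactly one alloc and one free because `cuda_empty((n, 2), np.float64)` performs the only NRT allocation, while the `y = x[i]` views in the loops only touch reference counts (my reading of the test; the diff does not state this). A small helper capturing the pattern (a sketch, not part of the repository; `assert_net_allocations` is a hypothetical name, and `rtsys` is passed in rather than imported since the diff does not show the test module's imports):

    from unittest.mock import patch

    # Sketch of the stats-delta assertion pattern used by both tests above.
    # Assumes get_allocation_stats() exposes .alloc and .free counters,
    # as the tests use.
    def assert_net_allocations(testcase, rtsys, launch, expected):
        before = rtsys.get_allocation_stats()
        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
            launch()  # e.g. lambda: g[1, 1](10)
        after = rtsys.get_allocation_stats()
        testcase.assertEqual(after.alloc - before.alloc, expected)
        testcase.assertEqual(after.free - before.free, expected)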
