From 7928f2d761ae280546b7cfb771542b99a0a4bea1 Mon Sep 17 00:00:00 2001
From: dlee992 <lidanuaa@gmail.com>
Date: Thu, 22 Aug 2024 23:17:22 -0500
Subject: [PATCH 1/3] copy from numba PR #8458

---
 numba_cuda/numba/cuda/dispatcher.py           |  38 +++--
 numba_cuda/numba/cuda/reshape_funcs.cu        | 151 ++++++++++++++++++
 .../tests/cudapy/test_cuda_array_interface.py |  74 +++++++++
 3 files changed, 254 insertions(+), 9 deletions(-)
 create mode 100644 numba_cuda/numba/cuda/reshape_funcs.cu

diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index 16680a2..63dac6e 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -36,6 +36,8 @@
                         'hrcp', 'hrint',
                         'htrunc', 'hdiv']
 
+reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape']
+
 
 class _Kernel(serialize.ReduceMixin):
     '''
@@ -105,15 +107,33 @@ def __init__(self, py_func, argtypes, link=None, debug=False,
         if self.cooperative:
             lib.needs_cudadevrt = True
 
-        res = [fn for fn in cuda_fp16_math_funcs
-               if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
-
-        if res:
-            # Path to the source containing the foreign function
-            basedir = os.path.dirname(os.path.abspath(__file__))
-            functions_cu_path = os.path.join(basedir,
-                                             'cpp_function_wrappers.cu')
-            link.append(functions_cu_path)
+        def link_to_library_functions(library_functions, library_path,
+                                      prefix=None):
+            """
+            Append ``library_path`` (relative to this module) to ``link`` if
+            the generated assembly mentions any of ``library_functions``, each
+            prepended with ``prefix`` if given. Returns the names found.
+            """
+            if prefix is not None:
+                library_functions = [f"{prefix}{fn}" for fn in
+                                     library_functions]
+
+            asm = lib.get_asm_str()  # generate once, not once per name
+            found_functions = [fn for fn in library_functions if fn in asm]
+
+            if found_functions:
+                basedir = os.path.dirname(os.path.abspath(__file__))
+                source_file_path = os.path.join(basedir, library_path)
+                link.append(source_file_path)
+
+            return found_functions
+
+        # Link to the helper library functions if needed
+        link_to_library_functions(reshape_funcs, 'reshape_funcs.cu')
+        # Link to the CUDA FP16 math library functions if needed
+        link_to_library_functions(cuda_fp16_math_funcs,
+                                  'cpp_function_wrappers.cu',
+                                  '__numba_wrapper_')
 
         for filepath in link:
             lib.add_linking_file(filepath)
diff --git a/numba_cuda/numba/cuda/reshape_funcs.cu b/numba_cuda/numba/cuda/reshape_funcs.cu
new file mode 100644
index 0000000..bd91014
--- /dev/null
+++ b/numba_cuda/numba/cuda/reshape_funcs.cu
@@ -0,0 +1,151 @@
+/*
+ * Handle reshaping of zero-sized array.
+ * See numba_attempt_nocopy_reshape() below.
+ */
+#define NPY_MAXDIMS 32
+
+typedef long int npy_intp;
+
+extern "C" __device__ int
+nocopy_empty_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
+                     npy_intp newnd, const npy_intp *newdims,
+                     npy_intp *newstrides, npy_intp itemsize,
+                     int is_f_order)
+{
+    /* An empty array has no elements to address, so in theory any stride
+     * values would do; itemsize is stored in each of the newnd entries.
+     */
+    for (int axis = 0; axis < newnd; ++axis) {
+        newstrides[axis] = itemsize;
+    }
+    return 1;  /* always reports a successful reshape */
+}
+
+/*
+ * Adapted from Numpy's _attempt_nocopy_reshape()
+ * (np/core/src/multiarray/shape.c).
+ * Attempt to reshape an array without copying data.
+ *
+ * dims/strides describe the nd-dimensional input, newdims the requested
+ * newnd-dimensional shape; a nonzero is_f_order selects Fortran order.
+ * Axes of length 1 are handled; zero strides should work but are untested.
+ * If a copy is needed, returns 0.
+ * If no copy is needed, returns 1 and fills `npy_intp *newstrides`
+ *     with appropriate strides.
+ */
+extern "C" __device__ int
+numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
+                             npy_intp newnd, const npy_intp *newdims,
+                             npy_intp *newstrides, npy_intp itemsize,
+                             int is_f_order)
+{
+    int oldnd;
+    npy_intp olddims[NPY_MAXDIMS];
+    npy_intp oldstrides[NPY_MAXDIMS];
+    npy_intp np, op, last_stride;
+    int oi, oj, ok, ni, nj, nk;
+
+    oldnd = 0;
+    /*
+     * Remove axes with dimension 1 from the old array. They have no effect
+     * but would need special cases since their strides do not matter.
+     */
+    for (oi = 0; oi < nd; oi++) {
+        if (dims[oi]!= 1) {
+            olddims[oldnd] = dims[oi];
+            oldstrides[oldnd] = strides[oi];
+            oldnd++;
+        }
+    }
+
+    np = 1;
+    for (ni = 0; ni < newnd; ni++) {
+        np *= newdims[ni];
+    }
+    op = 1;
+    for (oi = 0; oi < oldnd; oi++) {
+        op *= olddims[oi];
+    }
+    if (np != op) {
+        /* different total sizes; no hope */
+        return 0;
+    }
+
+    if (np == 0) {
+        /* the Numpy code does not handle 0-sized arrays */
+        return nocopy_empty_reshape(nd, dims, strides,
+                                    newnd, newdims, newstrides,
+                                    itemsize, is_f_order);
+    }
+
+    /* oi to oj and ni to nj give the axis ranges currently worked with */
+    oi = 0;
+    oj = 1;
+    ni = 0;
+    nj = 1;
+    while (ni < newnd && oi < oldnd) {
+        np = newdims[ni];
+        op = olddims[oi];
+
+        while (np != op) {
+            if (np < op) {
+                /* Misses trailing 1s, these are handled later */
+                np *= newdims[nj++];
+            } else {
+                op *= olddims[oj++];
+            }
+        }
+
+        /* Check whether the original axes can be combined */
+        for (ok = oi; ok < oj - 1; ok++) {
+            if (is_f_order) {
+                if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok]) {
+                     /* not contiguous enough */
+                    return 0;
+                }
+            }
+            else {
+                /* C order */
+                if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1]) {
+                    /* not contiguous enough */
+                    return 0;
+                }
+            }
+        }
+
+        /* Calculate new strides for all axes currently worked with */
+        if (is_f_order) {
+            newstrides[ni] = oldstrides[oi];
+            for (nk = ni + 1; nk < nj; nk++) {
+                newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
+            }
+        }
+        else {
+            /* C order */
+            newstrides[nj - 1] = oldstrides[oj - 1];
+            for (nk = nj - 1; nk > ni; nk--) {
+                newstrides[nk - 1] = newstrides[nk]*newdims[nk];
+            }
+        }
+        ni = nj++;
+        oi = oj++;
+    }
+
+    /* Set strides corresponding to trailing 1s of the new shape. When no
+     * new axis was assigned above (ni == 0) there is no previous axis, so
+     * use itemsize and do not read newdims[ni - 1] (out of bounds). */
+    if (ni >= 1) {
+        last_stride = newstrides[ni - 1];
+        if (is_f_order) {
+            last_stride *= newdims[ni - 1];
+        }
+    }
+    else {
+        last_stride = itemsize;
+    }
+    for (nk = ni; nk < newnd; nk++) {
+        newstrides[nk] = last_stride;
+    }
+
+    return 1;
+}
\ No newline at end of file
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py b/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py
index 6448f45..e7ec698 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py
@@ -9,6 +9,31 @@
 from unittest.mock import call, patch
 
 
+def array_reshape1d(arr, newshape, got):  # passed to check() as kernelfunc
+    y = arr.reshape(newshape)  # reshape under test; y is indexed as 1-D
+    for i in range(y.shape[0]):
+        got[i] = y[i]  # copy each element into the output array
+
+
+def array_reshape2d(arr, newshape, got):  # passed to check() as kernelfunc
+    y = arr.reshape(newshape)  # reshape under test; y is indexed as 2-D
+    for i in range(y.shape[0]):
+        for j in range(y.shape[1]):
+            got[i, j] = y[i, j]  # copy each element into the output array
+
+
+def array_reshape3d(arr, newshape, got):  # passed to check() as kernelfunc
+    y = arr.reshape(newshape)  # reshape under test; y is indexed as 3-D
+    for i in range(y.shape[0]):
+        for j in range(y.shape[1]):
+            for k in range(y.shape[2]):
+                got[i, j, k] = y[i, j, k]  # copy each element to the output
+
+
+def array_reshape(arr, newshape):  # passed to check() as pyfunc
+    return arr.reshape(newshape)  # called directly to build `expected`
+
+
 @skip_on_cudasim('CUDA Array Interface is not supported in the simulator')
 class TestCudaArrayInterface(ContextResettingTestCase):
     def assertPointersEqual(self, a, b):
@@ -430,6 +455,55 @@ def f(x, y):
             # Ensure that synchronize was not called
             mock_sync.assert_not_called()
 
+    # NOTE(review): skip_unless_cuda_python guard left disabled - confirm
+    def test_array_reshape(self):
+        def check(pyfunc, kernelfunc, arr, shape):
+            kernel = cuda.jit(kernelfunc)
+            expected = pyfunc(arr, shape)
+            got = np.zeros(expected.shape, dtype=arr.dtype)
+            kernel[1, 1](arr, shape, got)
+            self.assertPreciseEqual(got, expected)
+
+        def check_only_shape(kernelfunc, arr, shape, expected_shape):
+            kernel = cuda.jit(kernelfunc)
+            got = np.zeros(expected_shape, dtype=arr.dtype)
+            kernel[1, 1](arr, shape, got)
+            self.assertEqual(got.shape, expected_shape)  # true by construction
+            self.assertEqual(got.size, arr.size)
+
+        # 0-sized arrays, including shapes with a -1 entry
+        def check_empty(arr):
+            check(array_reshape, array_reshape1d, arr, 0)
+            check(array_reshape, array_reshape1d, arr, (0,))
+            check(array_reshape, array_reshape3d, arr, (1, 0, 2))
+            check_only_shape(array_reshape2d, arr, (0, -1), (0, 0))
+            check_only_shape(array_reshape2d, arr, (4, -1), (4, 0))
+            check_only_shape(array_reshape3d, arr, (-1, 0, 4), (0, 0, 4))
+
+        # C-contiguous 1-D input
+        arr = np.arange(24)
+        check(array_reshape, array_reshape1d, arr, (24,))
+        check(array_reshape, array_reshape2d, arr, (4, 6))
+        check(array_reshape, array_reshape2d, arr, (8, 3))
+        check(array_reshape, array_reshape3d, arr, (8, 1, 3))
+
+        arr = np.arange(24).reshape((1, 8, 1, 1, 3, 1))
+        check(array_reshape, array_reshape1d, arr, (24,))
+        check(array_reshape, array_reshape2d, arr, (4, 6))
+        check(array_reshape, array_reshape2d, arr, (8, 3))
+        check(array_reshape, array_reshape3d, arr, (8, 1, 3))
+
+        # Shapes given as -1 or containing a single -1 entry
+        arr = np.arange(25).reshape(5, 5)
+        check(array_reshape, array_reshape1d, arr, -1)
+        check(array_reshape, array_reshape1d, arr, (-1,))
+        check(array_reshape, array_reshape2d, arr, (-1, 5))
+        check(array_reshape, array_reshape3d, arr, (5, -1, 5))
+        check(array_reshape, array_reshape3d, arr, (5, 5, -1))
+
+        arr = np.array([])
+        check_empty(arr)
+
+
 
 if __name__ == "__main__":
     unittest.main()

From bd3d0a6f48295ba944580bf9ee11c979a48e39bd Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Fri, 29 Nov 2024 15:49:14 +0000
Subject: [PATCH 2/3] Fix flake8

---
 numba_cuda/numba/cuda/dispatcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index 64ebc77..7c56e70 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -152,7 +152,7 @@ def link_to_library_functions(library_functions, library_path,
         link_to_library_functions(cuda_fp16_math_funcs,
                                   'cpp_function_wrappers.cu',
                                   '__numba_wrapper_')
-        
+
         self.maybe_link_nrt(link, tgt_ctx, asm)
 
         for filepath in link:

From 4c90fef1453996ac6676213d4665eb3711e554a7 Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Fri, 29 Nov 2024 21:45:53 +0000
Subject: [PATCH 3/3] Use long long int for npy_intp in reshape_funcs.cu

This is 64 bits on Linux and Windows, which corrects an issue with the
prototypes not matching on Windows.
---
 numba_cuda/numba/cuda/reshape_funcs.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numba_cuda/numba/cuda/reshape_funcs.cu b/numba_cuda/numba/cuda/reshape_funcs.cu
index bd91014..123bfed 100644
--- a/numba_cuda/numba/cuda/reshape_funcs.cu
+++ b/numba_cuda/numba/cuda/reshape_funcs.cu
@@ -4,7 +4,7 @@
  */
 #define NPY_MAXDIMS 32
 
-typedef long int npy_intp;
+typedef long long int npy_intp;
 
 extern "C" __device__ int
 nocopy_empty_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,