Merge branch 'v0.6.15' of https://github.com/bytedance/gpu4pyscf into…

… v0.6.15
bytedance · Jan 12, 2024 · bcddd33 · bcddd33
2 parents cc6a282 + 0e784f1
commit bcddd33
Show file tree

Hide file tree

Showing 5 changed files with 106 additions and 8 deletions.
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
@@ -250,6 +250,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
 
         row = intopt.ao_pairs_row[cp_ij_id] - i0
         col = intopt.ao_pairs_col[cp_ij_id] - j0
+
         ints_slices = ints_slices[:,col,row]
         if cd_low.tag == 'eig':
             cderi_block = cupy.dot(cd_low.T, ints_slices)

diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
@@ -1305,6 +1305,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
                 shls_slice=non0shl_idx,
                 ao_loc_slice=ao_loc_slice,
                 ctr_offsets_slice=ctr_offsets_slice)
+
             t1 = log.timer_debug2('evaluate ao slice', *t1)
             if pad > 0:
                 if deriv == 0:

diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
@@ -255,12 +255,12 @@ def block_diag(blocks, out=None):
 
 def take_last2d(a, indices, out=None):
     '''
-    reorder the last 2 dimensions with 'indices', the first n-2 indices do not change
-    shape in the last 2 dimensions have to be the same
+    Reorder the last 2 dimensions as a[..., indices[:,None], indices]
     '''
     assert a.flags.c_contiguous
     assert a.shape[-1] == a.shape[-2]
     nao = a.shape[-1]
+    assert len(indices) == nao
     if a.ndim == 2:
         count = 1
     else:
@@ -281,7 +281,36 @@ def take_last2d(a, indices, out=None):
         raise RuntimeError('failed in take_last2d kernel')
     return out
 
-def transpose_sum(a):
+def takebak(out, a, indices, axis=-1):
+    '''(experimental)
+    Take elements from a NumPy array along an axis and write to CuPy array.
+    out[..., indices] = a
+    '''
+    assert axis == -1
+    assert isinstance(a, np.ndarray)
+    assert isinstance(out, cupy.ndarray)
+    assert out.ndim == a.ndim
+    assert a.shape[-1] == len(indices)
+    if a.ndim == 1:
+        count = 1
+    else:
+        assert out.shape[:-1] == a.shape[:-1]
+        count = np.prod(a.shape[:-1])
+    n_a = a.shape[-1]
+    n_o = out.shape[-1]
+    indices_int32 = cupy.asarray(indices, dtype=cupy.int32)
+    stream = cupy.cuda.get_current_stream()
+    err = libcupy_helper.takebak(
+        ctypes.c_void_p(stream.ptr),
+        ctypes.c_void_p(out.data.ptr), a.ctypes,
+        ctypes.c_void_p(indices_int32.data.ptr),
+        ctypes.c_int(count), ctypes.c_int(n_o), ctypes.c_int(n_a)
+    )
+    if err != 0: # Not the mapped host memory
+        out[...,indices] = cupy.asarray(a)
+    return out
+
+def transpose_sum(a, stream=None):
     '''
     return a + a.transpose(0,2,1)
     '''
@@ -514,3 +543,17 @@ def _qr(xs, dot, lindep=1e-14):
 
 def _gen_x0(v, xs):
     return cupy.dot(v.T, xs)
+
+def empty_mapped(shape, dtype=float, order='C'):
+    '''(experimental)
+    Returns a new, uninitialized NumPy array with the given shape and dtype.
+
+    This is a convenience function which is just :func:`numpy.empty`,
+    except that the underlying buffer is a pinned and mapped memory.
+    This array can be used as the buffer of zero-copy memory.
+    '''
+    nbytes = np.prod(shape) * np.dtype(dtype).itemsize
+    mem = cupy.cuda.PinnedMemoryPointer(
+        cupy.cuda.PinnedMemory(nbytes, cupy.cuda.runtime.hostAllocMapped), 0)
+    out = np.ndarray(shape, dtype=dtype, buffer=mem, order=order)
+    return out
diff --git a/gpu4pyscf/lib/cupy_helper/take_last2d.cu b/gpu4pyscf/lib/cupy_helper/take_last2d.cu
@@ -17,11 +17,12 @@
 #include <cuda_runtime.h>
 #include <stdio.h>
 #define THREADS        32
+#define COUNT_BLOCK     80
 
 __global__
-static void _take(double *a, const double *b, int *indices, int n)
+static void _take_last2d(double *a, const double *b, int *indices, int n)
 {
-    int i = blockIdx.z;
+    size_t i = blockIdx.z;
     int j = blockIdx.x * blockDim.x + threadIdx.x;
     int k = blockIdx.y * blockDim.y + threadIdx.y;
     if (j >= n || k >= n) {
@@ -35,18 +36,61 @@ static void _take(double *a, const double *b, int *indices, int n)
     a[off + j * n + k] = b[off + j_b * n + k_b];
 }
 
+__global__
+static void _takebak(double *out, double *a, int *indices,
+                     int count, int n_o, int n_a)
+{
+    int i0 = blockIdx.y * COUNT_BLOCK;
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j > n_a) {
+        return;
+    }
+
+    // a is on host with zero-copy memory. We need enough iterations for
+    // data prefetch to hide latency
+    int i1 = i0 + COUNT_BLOCK;
+    if (i1 > count) i1 = count;
+    int jp = indices[j];
+#pragma unroll
+    for (size_t i = i0; i < i1; ++i) {
+        out[i * n_o + jp] = a[i * n_a + j];
+    }
+}
+
 extern "C" {
 int take_last2d(cudaStream_t stream, double *a, const double *b, int *indices, int blk_size, int n)
 {
     // reorder j and k in a[i,j,k] with indicies
     int ntile = (n + THREADS - 1) / THREADS;
     dim3 threads(THREADS, THREADS);
     dim3 blocks(ntile, ntile, blk_size);
-    _take<<<blocks, threads, 0, stream>>>(a, b, indices, n);
+    _take_last2d<<<blocks, threads, 0, stream>>>(a, b, indices, n);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
         return 1;
     }
     return 0;
 }
+
+int takebak(cudaStream_t stream, double *out, double *a_h, int *indices,
+            int count, int n_o, int n_a)
+{
+    double *a_d;
+    cudaError_t err;
+    err = cudaHostGetDevicePointer(&a_d, a_h, 0); // zero-copy check
+    if (err != cudaSuccess) {
+        return 1;
+    }
+
+    int ntile = (n_a + THREADS*THREADS - 1) / (THREADS*THREADS);
+    int ncount = (count + COUNT_BLOCK - 1) / COUNT_BLOCK;
+    dim3 threads(THREADS*THREADS);
+    dim3 blocks(ntile, ncount);
+    _takebak<<<blocks, threads, 0, stream>>>(out, a_d, indices, count, n_o, n_a);
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        return 1;
+    }
+    return 0;
+}
 }
diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py
@@ -18,7 +18,7 @@
 import cupy
 from gpu4pyscf.lib.cupy_helper import (
     take_last2d, transpose_sum, krylov, unpack_sparse,
-    add_sparse, dist_matrix)
+    add_sparse, takebak, empty_mapped, dist_matrix)
 
 class KnownValues(unittest.TestCase):
     def test_take_last2d(self):
@@ -76,6 +76,15 @@ def test_dist_matrix(self):
         rij0 = dist_matrix(a)
         assert cupy.linalg.norm(rij - rij0) < 1e-10
 
+    def test_takebak(self):
+        a = empty_mapped((5, 8))
+        a[:] = 1.
+        idx = numpy.arange(8) * 2
+        out = cupy.zeros((5, 16))
+        takebak(out, a, idx)
+        out[:,idx] -= 1.
+        assert abs(out).sum() == 0.
+
 if __name__ == "__main__":
     print("Full tests for cupy helper module")
-    unittest.main()
+    unittest.main()