Skip to content

Commit

Permalink
Merge branch 'v0.6.15' of https://github.com/bytedance/gpu4pyscf into…
Browse files Browse the repository at this point in the history
… v0.6.15
  • Loading branch information
wxj6000 committed Jan 12, 2024
2 parents cc6a282 + 0e784f1 commit bcddd33
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 8 deletions.
1 change: 1 addition & 0 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):

row = intopt.ao_pairs_row[cp_ij_id] - i0
col = intopt.ao_pairs_col[cp_ij_id] - j0

ints_slices = ints_slices[:,col,row]
if cd_low.tag == 'eig':
cderi_block = cupy.dot(cd_low.T, ints_slices)
Expand Down
1 change: 1 addition & 0 deletions gpu4pyscf/dft/numint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1305,6 +1305,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
shls_slice=non0shl_idx,
ao_loc_slice=ao_loc_slice,
ctr_offsets_slice=ctr_offsets_slice)

t1 = log.timer_debug2('evaluate ao slice', *t1)
if pad > 0:
if deriv == 0:
Expand Down
49 changes: 46 additions & 3 deletions gpu4pyscf/lib/cupy_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,12 +255,12 @@ def block_diag(blocks, out=None):

def take_last2d(a, indices, out=None):
'''
reorder the last 2 dimensions with 'indices', the first n-2 indices do not change
shape in the last 2 dimensions have to be the same
Reorder the last 2 dimensions as a[..., indices[:,None], indices]
'''
assert a.flags.c_contiguous
assert a.shape[-1] == a.shape[-2]
nao = a.shape[-1]
assert len(indices) == nao
if a.ndim == 2:
count = 1
else:
Expand All @@ -281,7 +281,36 @@ def take_last2d(a, indices, out=None):
raise RuntimeError('failed in take_last2d kernel')
return out

def transpose_sum(a):
def takebak(out, a, indices, axis=-1):
'''(experimental)
Take elements from a NumPy array along an axis and write to CuPy array.
out[..., indices] = a
'''
assert axis == -1
assert isinstance(a, np.ndarray)
assert isinstance(out, cupy.ndarray)
assert out.ndim == a.ndim
assert a.shape[-1] == len(indices)
if a.ndim == 1:
count = 1
else:
assert out.shape[:-1] == a.shape[:-1]
count = np.prod(a.shape[:-1])
n_a = a.shape[-1]
n_o = out.shape[-1]
indices_int32 = cupy.asarray(indices, dtype=cupy.int32)
stream = cupy.cuda.get_current_stream()
err = libcupy_helper.takebak(
ctypes.c_void_p(stream.ptr),
ctypes.c_void_p(out.data.ptr), a.ctypes,
ctypes.c_void_p(indices_int32.data.ptr),
ctypes.c_int(count), ctypes.c_int(n_o), ctypes.c_int(n_a)
)
if err != 0: # Not the mapped host memory
out[...,indices] = cupy.asarray(a)
return out

def transpose_sum(a, stream=None):
'''
return a + a.transpose(0,2,1)
'''
Expand Down Expand Up @@ -514,3 +543,17 @@ def _qr(xs, dot, lindep=1e-14):

def _gen_x0(v, xs):
return cupy.dot(v.T, xs)

def empty_mapped(shape, dtype=float, order='C'):
'''(experimental)
Returns a new, uninitialized NumPy array with the given shape and dtype.
This is a convenience function which is just :func:`numpy.empty`,
except that the underlying buffer is a pinned and mapped memory.
This array can be used as the buffer of zero-copy memory.
'''
nbytes = np.prod(shape) * np.dtype(dtype).itemsize
mem = cupy.cuda.PinnedMemoryPointer(
cupy.cuda.PinnedMemory(nbytes, cupy.cuda.runtime.hostAllocMapped), 0)
out = np.ndarray(shape, dtype=dtype, buffer=mem, order=order)
return out
50 changes: 47 additions & 3 deletions gpu4pyscf/lib/cupy_helper/take_last2d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
#include <cuda_runtime.h>
#include <stdio.h>
#define THREADS 32
#define COUNT_BLOCK 80

__global__
static void _take(double *a, const double *b, int *indices, int n)
static void _take_last2d(double *a, const double *b, int *indices, int n)
{
int i = blockIdx.z;
size_t i = blockIdx.z;
int j = blockIdx.x * blockDim.x + threadIdx.x;
int k = blockIdx.y * blockDim.y + threadIdx.y;
if (j >= n || k >= n) {
Expand All @@ -35,18 +36,61 @@ static void _take(double *a, const double *b, int *indices, int n)
a[off + j * n + k] = b[off + j_b * n + k_b];
}

__global__
static void _takebak(double *out, double *a, int *indices,
int count, int n_o, int n_a)
{
int i0 = blockIdx.y * COUNT_BLOCK;
int j = blockIdx.x * blockDim.x + threadIdx.x;
if (j > n_a) {
return;
}

// a is on host with zero-copy memory. We need enough iterations for
// data prefetch to hide latency
int i1 = i0 + COUNT_BLOCK;
if (i1 > count) i1 = count;
int jp = indices[j];
#pragma unroll
for (size_t i = i0; i < i1; ++i) {
out[i * n_o + jp] = a[i * n_a + j];
}
}

extern "C" {
int take_last2d(cudaStream_t stream, double *a, const double *b, int *indices, int blk_size, int n)
{
// reorder j and k in a[i,j,k] with indicies
int ntile = (n + THREADS - 1) / THREADS;
dim3 threads(THREADS, THREADS);
dim3 blocks(ntile, ntile, blk_size);
_take<<<blocks, threads, 0, stream>>>(a, b, indices, n);
_take_last2d<<<blocks, threads, 0, stream>>>(a, b, indices, n);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
return 1;
}
return 0;
}

int takebak(cudaStream_t stream, double *out, double *a_h, int *indices,
int count, int n_o, int n_a)
{
double *a_d;
cudaError_t err;
err = cudaHostGetDevicePointer(&a_d, a_h, 0); // zero-copy check
if (err != cudaSuccess) {
return 1;
}

int ntile = (n_a + THREADS*THREADS - 1) / (THREADS*THREADS);
int ncount = (count + COUNT_BLOCK - 1) / COUNT_BLOCK;
dim3 threads(THREADS*THREADS);
dim3 blocks(ntile, ncount);
_takebak<<<blocks, threads, 0, stream>>>(out, a_d, indices, count, n_o, n_a);
err = cudaGetLastError();
if (err != cudaSuccess) {
return 1;
}
return 0;
}
}
13 changes: 11 additions & 2 deletions gpu4pyscf/lib/tests/test_cupy_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import cupy
from gpu4pyscf.lib.cupy_helper import (
take_last2d, transpose_sum, krylov, unpack_sparse,
add_sparse, dist_matrix)
add_sparse, takebak, empty_mapped, dist_matrix)

class KnownValues(unittest.TestCase):
def test_take_last2d(self):
Expand Down Expand Up @@ -76,6 +76,15 @@ def test_dist_matrix(self):
rij0 = dist_matrix(a)
assert cupy.linalg.norm(rij - rij0) < 1e-10

def test_takebak(self):
a = empty_mapped((5, 8))
a[:] = 1.
idx = numpy.arange(8) * 2
out = cupy.zeros((5, 16))
takebak(out, a, idx)
out[:,idx] -= 1.
assert abs(out).sum() == 0.

if __name__ == "__main__":
print("Full tests for cupy helper module")
unittest.main()
unittest.main()

0 comments on commit bcddd33

Please sign in to comment.