Skip to content

Commit

Permalink
Merge branch 'master' into uks_dev
Browse files Browse the repository at this point in the history
  • Loading branch information
wxj6000 authored Jan 24, 2024
2 parents 4c7acb2 + b501107 commit 3f311a4
Show file tree
Hide file tree
Showing 54 changed files with 1,514 additions and 599 deletions.
3 changes: 3 additions & 0 deletions dockerfiles/manylinux/build_wheels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ export CUTENSOR_DIR=/usr/local/cuda
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# blas is required by DFTD3 and DFTD4
yum install -y openblas-devel

# Compile wheels
rm -rf /gpu4pyscf/wheelhouse
for PYBIN in /opt/python/cp311-cp311/bin; do
Expand Down
2 changes: 1 addition & 1 deletion examples/dft_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@
parser.add_argument("--solvent", type=str, default='')
args = parser.parse_args()

lib.num_threads(16)
start_time = time.time()
bas = args.basis
mol = pyscf.M(
atom=args.input,
basis=bas,
max_memory=32000)
# set verbose >= 6 for debugging timer

mol.verbose = 4

mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
Expand Down
10 changes: 5 additions & 5 deletions gpu4pyscf/__config__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,24 @@
GB = 1024*1024*1024
# such as A100-80G
if props['totalGlobalMem'] >= 64 * GB:
min_ao_blksize = 256
min_grid_blksize = 256*256
min_ao_blksize = 128
min_grid_blksize = 128*128
ao_aligned = 32
grid_aligned = 128
mem_fraction = 0.9
number_of_threads = 2048 * 108
# such as V100-32G
elif props['totalGlobalMem'] >= 32 * GB:
min_ao_blksize = 128
min_grid_blksize = 256*256#128*128
min_grid_blksize = 128*128
ao_aligned = 32
grid_aligned = 128
mem_fraction = 0.9
number_of_threads = 1024 * 80
# such as A30-24GB
elif props['totalGlobalMem'] >= 16 * GB:
min_ao_blksize = 128
min_grid_blksize = 256*256#128*128
min_grid_blksize = 128*128
ao_aligned = 32
grid_aligned = 128
mem_fraction = 0.9
Expand All @@ -35,4 +35,4 @@
mem_fraction = 0.9
number_of_threads = 1024 * 80

cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
3 changes: 2 additions & 1 deletion gpu4pyscf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from . import lib, grad, hessian, solvent, scf, dft
__version__ = '0.6.13'

__version__ = '0.6.17'

# monkey patch libxc reference due to a bug in nvcc
from pyscf.dft import libxc
Expand Down
38 changes: 13 additions & 25 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from pyscf import lib
from pyscf.df import df, addons
from gpu4pyscf.lib.cupy_helper import (
cholesky, tag_array, get_avail_mem, cart2sph, take_last2d)
cholesky, tag_array, get_avail_mem, cart2sph, take_last2d, transpose_sum)
from gpu4pyscf.df import int3c2e, df_jk
from gpu4pyscf.lib import logger
from gpu4pyscf import __config__
Expand Down Expand Up @@ -57,18 +57,6 @@ def build(self, direct_scf_tol=1e-14, omega=None):
auxmol = self.auxmol
self.nao = mol.nao

# cache indices for better performance
nao = mol.nao
tril_row, tril_col = cupy.tril_indices(nao)
tril_row = cupy.asarray(tril_row)
tril_col = cupy.asarray(tril_col)

self.tril_row = tril_row
self.tril_col = tril_col

idx = np.arange(nao)
self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx)

log = logger.new_logger(mol, mol.verbose)
t0 = log.init_timer()
if auxmol is None:
Expand Down Expand Up @@ -147,7 +135,7 @@ def loop(self, blksize=None, unpack=True):
rows = self.intopt.cderi_row
cols = self.intopt.cderi_col
buf_prefetch = None

buf_cderi = cupy.zeros([blksize,nao,nao])
data_stream = cupy.cuda.stream.Stream(non_blocking=True)
compute_stream = cupy.cuda.get_current_stream()
#compute_stream = cupy.cuda.stream.Stream()
Expand All @@ -165,14 +153,15 @@ def loop(self, blksize=None, unpack=True):
buf_prefetch.set(cderi_sparse[p1:p2,:])
stop_event = data_stream.record()
if unpack:
buf2 = cupy.zeros([p1-p0,nao,nao])
buf2[:p1-p0,rows,cols] = buf
buf2[:p1-p0,cols,rows] = buf
buf_cderi[:p1-p0,rows,cols] = buf
buf_cderi[:p1-p0,cols,rows] = buf
buf2 = buf_cderi[:p1-p0]
else:
buf2 = None
yield buf2, buf.T
compute_stream.wait_event(stop_event)
cupy.cuda.Device().synchronize()
if isinstance(cderi_sparse, np.ndarray):
cupy.cuda.Device().synchronize()

if buf_prefetch is not None:
buf = buf_prefetch
Expand Down Expand Up @@ -217,8 +206,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)
except Exception:
raise RuntimeError('Out of CPU memory')

data_stream = cupy.cuda.stream.Stream(non_blocking=False)
if(not use_gpu_memory):
data_stream = cupy.cuda.stream.Stream(non_blocking=False)
count = 0
nq = len(intopt.log_qs)
for cp_ij_id, _ in enumerate(intopt.log_qs):
Expand Down Expand Up @@ -247,6 +236,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
int3c2e.get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, out=ints_slices[k0:k1], omega=omega)
ints_slices -= ints_slices_lr
else:
# Initialization is required due to cutensor operations later
ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
for cp_kl_id, _ in enumerate(intopt.aux_log_qs):
k0 = intopt.sph_aux_loc[cp_kl_id]
Expand All @@ -261,10 +251,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):

row = intopt.ao_pairs_row[cp_ij_id] - i0
col = intopt.ao_pairs_col[cp_ij_id] - j0
if cpi == cpj:
ints_slices = ints_slices + ints_slices.transpose([0,2,1])
ints_slices = ints_slices[:,col,row]

ints_slices = ints_slices[:,col,row]
if cd_low.tag == 'eig':
cderi_block = cupy.dot(cd_low.T, ints_slices)
ints_slices = None
Expand All @@ -280,8 +268,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
for i in range(naux):
cderi_block[i].get(out=cderi[i,ij0:ij1])
t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq}', *t1)

cupy.cuda.Device().synchronize()
if not use_gpu_memory:
cupy.cuda.Device().synchronize()
return cderi


10 changes: 4 additions & 6 deletions gpu4pyscf/df/df_jk.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ def build_df():
rks.initialize_grids(mf, mf.mol, dm0)
ni.build(mf.mol, mf.grids.coords)
mf._numint.xcfuns = numint._init_xcfuns(mf.xc, dm0.ndim==3)
dm0 = cupy.asarray(dm0)
return

def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False):
Expand Down Expand Up @@ -269,18 +268,18 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
nao = dms_tag.shape[-1]
dms = dms_tag.reshape([-1,nao,nao])
nset = dms.shape[0]
t0 = log.init_timer()
t1 = t0 = log.init_timer()
if dfobj._cderi is None:
log.debug('CDERI not found, build...')
dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega)
t1 = log.timer_debug1('init jk', *t0)

assert nao == dfobj.nao
vj = None
vk = None
ao_idx = dfobj.intopt.sph_ao_idx
dms = take_last2d(dms, ao_idx)

t1 = log.timer_debug1('init jk', *t0)
rows = dfobj.intopt.cderi_row
cols = dfobj.intopt.cderi_col
if with_j:
Expand All @@ -306,15 +305,14 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-
vj_packed += cupy.dot(rhoj, cderi_sparse.T)
if with_k:
rhok = contract('Lij,jk->Lki', cderi, occ_coeff)
#vk[0] += contract('Lki,Lkj->ij', rhok, rhok)
cublas.syrk('T', rhok.reshape([-1,nao]), out=vk[0], alpha=1.0, beta=1.0, lower=True)
#vk[0] += 2.0 * contract('Lki,Lkj->ij', rhok, rhok)
cublas.syrk('T', rhok.reshape([-1,nao]), out=vk[0], alpha=2.0, beta=1.0, lower=True)
if with_j:
vj[:,rows,cols] = vj_packed
vj[:,cols,rows] = vj_packed
if with_k:
vk[0][numpy.diag_indices(nao)] *= 0.5
transpose_sum(vk)
vk *= 2.0
# CP-HF K matrix
elif hasattr(dms_tag, 'mo1'):
if with_j:
Expand Down
2 changes: 1 addition & 1 deletion gpu4pyscf/df/grad/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
mo_occ = cupy.asarray(mf_grad.base.mo_occ)
sph_ao_idx = intopt.sph_ao_idx
dm = take_last2d(dm0, sph_ao_idx)
orbo = contract('pi,i->pi', mo_coeff[:,mo_occ>0], numpy.sqrt(mo_occ[mo_occ>0]))
orbo = mo_coeff[:,mo_occ>0] * mo_occ[mo_occ>0] ** 0.5
orbo = orbo[sph_ao_idx, :]
nocc = orbo.shape[-1]

Expand Down
14 changes: 10 additions & 4 deletions gpu4pyscf/df/hessian/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import cupy
import numpy as np
from pyscf import lib, df
from gpu4pyscf.grad import rhf as rhf_grad
from gpu4pyscf.hessian import rhf as rhf_hess
from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info, take_last2d
from gpu4pyscf.df import int3c2e
Expand Down Expand Up @@ -283,6 +284,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00)
rho2c_0 = rho2c_10 = rho2c_11 = rho2c0_10 = rho2c1_10 = rho2c0_11 = int2c_ip_ip = None
wk_ip2_P__ = int2c_ip1_inv = None
t1 = log.timer_debug1('contract int2c_*', *t1)

ao_idx = np.argsort(intopt.sph_ao_idx)
aux_idx = np.argsort(intopt.sph_aux_idx)
rev_ao_ao = cupy.ix_(ao_idx, ao_idx)
Expand Down Expand Up @@ -372,7 +375,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
e1[j0,i0] = e1[i0,j0].T
ej[j0,i0] = ej[i0,j0].T
ek[j0,i0] = ek[i0,j0].T

t1 = log.timer_debug1('hcore contribution', *t1)
log.timer('RHF partial hessian', *time0)
return e1, ej, ek

Expand All @@ -398,6 +401,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
else:
return chkfile
'''

def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
verbose=None, with_k=True, omega=None):
log = logger.new_logger(hessobj, verbose)
Expand Down Expand Up @@ -521,8 +525,9 @@ def _ao2mo(mat):
vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2
vk1_int3c_ip1 = vk1_int3c_ip2 = None

grad_hcore = rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method())
cupy.get_default_memory_pool().free_all_blocks()
hcore_deriv = hessobj.base.nuc_grad_method().hcore_generator(mol)
#hcore_deriv = hessobj.base.nuc_grad_method().hcore_generator(mol)
vk1 = None
for i0, ia in enumerate(atmlst):
shl0, shl1, p0, p1 = aoslices[ia]
Expand All @@ -535,8 +540,9 @@ def _ao2mo(mat):
vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:]
vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)

h1 = hcore_deriv(ia)
h1 = _ao2mo(cupy.asarray(h1, order='C'))
h1 = grad_hcore[:,i0]
#h1 = hcore_deriv(ia)
#h1 = _ao2mo(cupy.asarray(h1, order='C'))
vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
if with_k:
vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
Expand Down
46 changes: 27 additions & 19 deletions gpu4pyscf/df/int3c2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,10 +306,7 @@ def build(self, cutoff=1e-14, group_size=None,
ncptype = len(log_qs)

self.bpcache = ctypes.POINTER(BasisProdCache)()
if diag_block_with_triu:
scale_shellpair_diag = 1.
else:
scale_shellpair_diag = 0.5
scale_shellpair_diag = 1.
libgint.GINTinit_basis_prod(
ctypes.byref(self.bpcache), ctypes.c_double(scale_shellpair_diag),
ao_loc.ctypes.data_as(ctypes.c_void_p),
Expand Down Expand Up @@ -1194,6 +1191,32 @@ def get_dh1e(mol, dm0):
dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1])
return 2.0 * cupy.einsum('kx,k->kx', dh1e, -charges)

def get_d2h1e(mol, dm0):
    '''Contract the second-derivative nuclear-attraction integrals with the
    density matrix dm0, resolved per atom pair.

    Returns a cupy array of shape (9, natm, natm): 9 Cartesian
    second-derivative components for each atom pair (a, b).
    '''
    natm = mol.natm
    coords = mol.atom_coords()
    charges = mol.atom_charges()
    # Nuclei modeled as point charges so the nuclear-attraction derivatives
    # can be evaluated with the 3-center-2-electron machinery.
    fakemol = gto.fakemol_for_charges(coords)

    nao = mol.nao
    d2h1e_diag = cupy.zeros([natm,9])          # same-atom (diagonal) blocks
    d2h1e_offdiag = cupy.zeros([natm, nao, 9]) # per-AO contributions, folded to atoms below
    intopt = VHFOpt(mol, fakemol, 'int2e')
    intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
    # Reorder dm0 to the spherical AO ordering used by the integral engine.
    dm0_sorted = take_last2d(dm0, intopt.sph_ao_idx)
    # 'ipip1': presumably both derivatives on the bra center -- TODO confirm
    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'):
        d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1])
        d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1])

    # 'ipvip1': presumably one derivative on bra, one on ket -- TODO confirm
    for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1'):
        d2h1e_diag[k0:k1,:9] -= contract('xaji,ij->ax', int3c_blk, dm0_sorted[i0:i1,j0:j1])
        d2h1e_offdiag[k0:k1,i0:i1,:9] += contract('xaji,ij->aix', int3c_blk, dm0_sorted[i0:i1,j0:j1])
    aoslices = mol.aoslice_by_atom()
    # ao2atom maps each AO index to its parent atom; contracting over the AO
    # axis folds the per-AO contributions into per-atom-pair blocks.
    ao2atom = get_ao2atom(intopt, aoslices)
    d2h1e = contract('aix,ib->abx', d2h1e_offdiag, ao2atom)
    d2h1e[np.diag_indices(natm), :] += d2h1e_diag
    # NOTE(review): sign convention differs from get_dh1e above (which uses
    # -charges) -- confirm against callers before relying on the sign here.
    return 2.0 * cupy.einsum('abx,a->xab', d2h1e, charges)

def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=None, stream=None):
'''
Generate one int3c2e block for given ij, k
Expand Down Expand Up @@ -1443,14 +1466,6 @@ def get_pairing(p_offsets, q_offsets, q_cond,
for q0, q1 in zip(q_offsets[:-1], q_offsets[1:]):
if aosym and q0 < p0 or not aosym:
q_sub = q_cond[p0:p1,q0:q1].ravel()
'''
idx = q_sub.argsort(axis=None)[::-1]
q_sorted = q_sub[idx]
mask = q_sorted > cutoff
idx = idx[mask]
ishs, jshs = np.unravel_index(idx, (p1-p0, q1-q0))
print(ishs.shape)
'''
mask = q_sub > cutoff
ishs, jshs = np.indices((p1-p0,q1-q0))
ishs = ishs.ravel()[mask]
Expand All @@ -1464,13 +1479,6 @@ def get_pairing(p_offsets, q_offsets, q_cond,
log_qs.append(log_q)
elif aosym and p0 == q0 and p1 == q1:
q_sub = q_cond[p0:p1,p0:p1].ravel()
'''
idx = q_sub.argsort(axis=None)[::-1]
q_sorted = q_sub[idx]
ishs, jshs = np.unravel_index(idx, (p1-p0, p1-p0))
mask = q_sorted > cutoff
'''

ishs, jshs = np.indices((p1-p0, p1-p0))
ishs = ishs.ravel()
jshs = jshs.ravel()
Expand Down
26 changes: 26 additions & 0 deletions gpu4pyscf/df/tests/test_int3c2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,32 @@ def test_int1e_iprinv(self):
h1ao = mol.intor('int1e_iprinv', comp=3) # <\nabla|1/r|>
assert np.linalg.norm(int3c[:,:,:,i] - h1ao) < 1e-8

def test_int1e_ipiprinv(self):
    """Check GPU 'ipip1' int3c2e integrals against PySCF int1e_ipiprinv."""
    from pyscf import gto

    atom_coords = mol.atom_coords()
    natm = len(mol.atom_charges())

    # Point charges placed at the nuclei drive the 3c2e evaluation.
    fakemol = gto.fakemol_for_charges(atom_coords)
    gpu_ints = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipip1').get()

    for atm_id in range(natm):
        mol.set_rinv_origin(atom_coords[atm_id])
        # comp=9: second-derivative components of the 1/r attraction integral
        ref = mol.intor('int1e_ipiprinv', comp=9)
        diff = gpu_ints[:, :, :, atm_id] - ref
        assert np.linalg.norm(diff) < 1e-8

def test_int1e_iprinvip(self):
    """Check GPU 'ipvip1' int3c2e integrals against PySCF int1e_iprinvip."""
    from pyscf import gto

    atom_coords = mol.atom_coords()
    natm = len(mol.atom_charges())

    # Point charges placed at the nuclei drive the 3c2e evaluation.
    fakemol = gto.fakemol_for_charges(atom_coords)
    gpu_ints = int3c2e.get_int3c2e_general(mol, fakemol, ip_type='ipvip1').get()

    for atm_id in range(natm):
        mol.set_rinv_origin(atom_coords[atm_id])
        # comp=9: mixed bra/ket derivative components of the 1/r integral
        ref = mol.intor('int1e_iprinvip', comp=9)
        diff = gpu_ints[:, :, :, atm_id] - ref
        assert np.linalg.norm(diff) < 1e-8

if __name__ == "__main__":
    # Run the int3c2e integral tests directly, outside a test runner.
    print("Full Tests for int3c")
    unittest.main()
Loading

0 comments on commit 3f311a4

Please sign in to comment.