diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index f1128fc5..4eb534e3 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -13,10 +13,35 @@ permissions:
   contents: read
 
 jobs:
-  build:
-
-    runs-on: self-hosted
+  single-gpu:
+    runs-on: [self-hosted, Linux, X64, v100]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Install dependencies
+      run: |
+        pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+        python3 -m pip install --upgrade pip
+        pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
+        pip3 install pyscf --upgrade
+        pip3 install git+https://github.com/pyscf/properties --upgrade
+        pip3 install numpy --upgrade
+        pip3 install h5py --upgrade
+        pip3 install gpu4pyscf-libxc-cuda12x --upgrade
+        pip3 install cupy-cuda12x --upgrade
+        git config --global core.compression 9
+    - name: Build GPU4PySCF
+      run: |
+        export CUDA_HOME=/usr/local/cuda
+        export CMAKE_CONFIGURE_ARGS="-DBUILD_LIBXC=OFF -DCUDA_ARCHITECTURES=70-real -DBUILD_CUTLASS=ON"
+        sh build.sh
+    - name: Test with pytest
+      run: |
+        echo $GITHUB_WORKSPACE
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
 
+  multi-gpu:
+    runs-on: [self-hosted, Linux, X64, 2T4]
     steps:
     - uses: actions/checkout@v3
     - name: Install dependencies
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 872268d7..e5185498 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -86,7 +86,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
         j2c = cupy.asarray(j2c_cpu, order='C')
         t0 = log.timer_debug1('2c2e', *t0)
         intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-        intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, 
+        intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True,
                      group_size=GROUP_SIZE, group_size_aux=GROUP_SIZE)
         log.timer_debug1('prepare intopt', *t0)
         self.j2c = j2c.copy()
@@ -105,7 +105,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
 
         naux = self.naux = self.cd_low.shape[1]
         log.debug('size of aux basis %d', naux)
-        self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low, 
+        self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low,
                                        omega=omega, use_gpu_memory=self.use_gpu_memory)
         log.timer_debug1('cholesky_eri', *t0)
         self.intopt = intopt
@@ -144,8 +144,8 @@ def get_blksize(self, extra=0, nao=None):
         return blksize
 
     def loop(self, blksize=None, unpack=True):
-        ''' loop over cderi for the current device 
-        and unpack the CDERI in (Lij) format 
+        ''' loop over cderi for the current device
+        and unpack the CDERI in (Lij) format
         '''
         device_id = cupy.cuda.Device().id
         cderi_sparse = self._cderi[device_id]
@@ -177,10 +177,10 @@ def loop(self, blksize=None, unpack=True):
                 yield buf2, buf.T
             if isinstance(cderi_sparse, np.ndarray):
                 cupy.cuda.Device().synchronize()
-            
+
             if buf_prefetch is not None:
                 buf = buf_prefetch
-            
+
     def reset(self, mol=None):
         '''Reset mol and clean up relevant attributes for scanner mode'''
         if mol is not None:
@@ -198,7 +198,7 @@
     get_ao_eri = get_eri = NotImplemented
     get_mo_eri = ao2mo = NotImplemented
 
-def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, 
+def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
                      omega=None, sr_only=False, use_gpu_memory=True):
     '''
     Returns:
@@ -210,13 +210,13 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
 
     # Available memory on Device 0.
     avail_mem = get_avail_mem()
-    
+
     if use_gpu_memory:
         # CDERI will be equally distributed to the devices
         # Other devices usually have more memory available than Device 0
         # CDERI will use up to 40% of the available memory
         use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
-    
+
     if use_gpu_memory:
         log.debug("Saving CDERI on GPU")
     else:
@@ -235,7 +235,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
             _cderi[device_id] = cderi_blk
 
     npairs_per_ctr = [len(intopt.ao_pairs_row[cp_ij_id]) for cp_ij_id in range(len(intopt.log_qs))]
-    
+
     npairs_per_ctr = np.array(npairs_per_ctr)
     total_task_list = np.argsort(npairs_per_ctr)
     task_list_per_device = []
@@ -253,13 +253,13 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
             future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi,
                                      omega=omega, sr_only=sr_only, device_id=device_id)
             futures.append(future)
-    
+
     for future in futures:
         future.result()
-    
+
     if not use_gpu_memory:
         cupy.cuda.Device().synchronize()
-    
+
     return _cderi
 
 def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
@@ -273,6 +273,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
     pairs_loc = np.append(0, np.cumsum(npairs))
     blksize = (naux + _num_devices - 1) // _num_devices
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(mol.verbose, int)
         log = logger.new_logger(mol, mol.verbose)
         t1 = log.init_timer()
         cd_low_tag = cd_low.tag
@@ -320,7 +321,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
 
             row = intopt.ao_pairs_row[cp_ij_id] - i0
             col = intopt.ao_pairs_col[cp_ij_id] - j0
-            
+
             ints_slices_f= cupy.empty([naoaux,len(row)], order='F')
             ints_slices_f[:] = ints_slices[:,col,row]
             ints_slices = None
@@ -330,12 +331,12 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             elif cd_low_tag == 'cd':
                 cderi_block = solve_triangular(cd_low, ints_slices_f, lower=True, overwrite_b=True)
             else:
-                RuntimeError('Tag is not found in lower triangular matrix.')
+                raise RuntimeError('Tag is not found in lower triangular matrix.')
             t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq} on Device {device_id}', *t1)
 
-            # TODO: 
+            # TODO:
             # 1) async data transfer
-            # 2) auxiliary basis in the last dimension 
+            # 2) auxiliary basis in the last dimension
             # if CDERI is saved on CPU
             ij0 = pairs_loc[cp_ij_id]
 
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index e8c8e37a..984ea2e2 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -249,6 +249,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
     ''' Calculate J and K matrices on single GPU
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
         dms = cupy.asarray(dms)
@@ -313,6 +314,7 @@ def _jk_task_with_mo1(dfobj, dms, mo1s, occ_coeffs,
     '''
     vj = vk = None
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
         dms = cupy.asarray(dms)
@@ -373,6 +375,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
     ''' Calculate J and K matrices with density matrix
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
         dms = cupy.asarray(dms)
@@ -404,7 +407,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
                 for k in range(nset):
                     rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
                     #vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
-                    vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao])) 
+                    vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
         if with_j:
             vj = cupy.zeros(dms_shape)
             vj[:,rows,cols] = vj_sparse
@@ -437,7 +440,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
 
         assert nao == dfobj.nao
         intopt = dfobj.intopt
-    
+
     nao = dms_tag.shape[-1]
     dms = dms_tag.reshape([-1,nao,nao])
     intopt = dfobj.intopt
@@ -456,7 +459,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
         with ThreadPoolExecutor(max_workers=_num_devices) as executor:
             for device_id in range(_num_devices):
                 future = executor.submit(
-                    _jk_task_with_mo, 
+                    _jk_task_with_mo,
                     dfobj, dms, mo_coeff, mo_occ,
                     hermi=hermi, device_id=device_id,
                     with_j=with_j, with_k=with_k)
@@ -477,7 +480,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
         with ThreadPoolExecutor(max_workers=_num_devices) as executor:
             for device_id in range(_num_devices):
                 future = executor.submit(
-                    _jk_task_with_mo1, 
+                    _jk_task_with_mo1,
                     dfobj, dms, mo1s, occ_coeffs,
                     hermi=hermi, device_id=device_id,
                     with_j=with_j, with_k=with_k)
diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py
index a8039e54..95a5f021 100644
--- a/gpu4pyscf/df/grad/jk.py
+++ b/gpu4pyscf/df/grad/jk.py
@@ -25,6 +25,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
     rhoj = rhok = None
     with cupy.cuda.Device(device_id), _streams[device_id]:
         log = logger.new_logger(with_df.mol, with_df.verbose)
+        assert isinstance(with_df.verbose, int)
         t0 = log.init_timer()
         dm = cupy.asarray(dm)
         orbo = cupy.asarray(orbo)
@@ -34,7 +35,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
         cols = with_df.intopt.cderi_col
         dm_sparse = dm[rows, cols]
         dm_sparse[with_df.intopt.cderi_diag] *= .5
-        
+
         blksize = with_df.get_blksize()
         if with_j:
             rhoj = cupy.empty([naux_slice])
@@ -65,18 +66,18 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
                 _jk_task,
                 with_df, dm, orbo, with_j=with_j, with_k=with_k, device_id=device_id)
             futures.append(future)
-    
+
     rhoj_total = []
     rhok_total = []
     for future in futures:
         rhoj, rhok = future.result()
         rhoj_total.append(rhoj)
         rhok_total.append(rhok)
-    
+
     rhoj = rhok = None
     if with_j:
         rhoj = concatenate(rhoj_total)
     if with_k:
         rhok = concatenate(rhok_total)
-    
+
     return rhoj, rhok
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 5ff64fc2..2e5a8da4 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -91,7 +91,7 @@ def eval_ao(mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=
     coords = cupy.asarray(coords.T, order='C')
     comp = (deriv+1)*(deriv+2)*(deriv+3)//6
     stream = cupy.cuda.get_current_stream()
-    
+
     # ao must be set to zero due to implementation
     if deriv > 1:
         if out is None:
@@ -123,7 +123,7 @@ def eval_ao(mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=
 
     if transpose:
         out = out.transpose(0,2,1)
-    
+
     if deriv == 0:
         out = out[0]
     return out
@@ -397,7 +397,7 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars):
     vxc[1,threshind] = 1.5*W*dW0dG
     return exc,vxc
 
-def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, 
+def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                  verbose=None, with_lapl=False, grid_range=(), device_id=0, hermi=1):
     ''' nr_rks task on given device
     '''
@@ -405,8 +405,8 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         if dms is not None: dms = cupy.asarray(dms)
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)
-
-        log = logger.new_logger(mol)
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
         t0 = log.init_timer()
         xctype = ni._xc_type(xc_code)
         nao = mol.nao
@@ -417,13 +417,13 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
             ao_deriv = 0
         else:
             ao_deriv = 1
-        
+
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
         grid_start = device_id * ngrids_per_device
         grid_end = (device_id + 1) * ngrids_per_device
         ngrids_local = grid_end - grid_start
-        
+
         weights = cupy.empty([ngrids_local])
         if xctype == 'LDA':
             rho_tot = cupy.empty([nset,1,ngrids_local])
@@ -434,22 +434,22 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
 
         p0 = p1 = 0
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
-                                                     max_memory=None, 
+                                                     max_memory=None,
                                                      grid_range=(grid_start, grid_end)):
             p1 = p0 + weight.size
             weights[p0:p1] = weight
             for i in range(nset):
                 if mo_coeff is None:
-                    rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], 
+                    rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx],
                                                   xctype=xctype, hermi=hermi, with_lapl=with_lapl)
                 else:
                     assert hermi == 1
                     mo_coeff_mask = mo_coeff[idx,:]
-                    rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, 
+                    rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ,
                                                    None, xctype, with_lapl)
             p0 = p1
         t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0)
-        
+
         # libxc calls are still running on default stream
         nelec = cupy.zeros(nset)
         excsum = cupy.zeros(nset)
@@ -471,7 +471,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         vmat = cupy.zeros((nset, nao, nao))
         p0 = p1 = 0
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
-                                                     max_memory=None, 
+                                                     max_memory=None,
                                                      grid_range=(grid_start, grid_end)):
             p1 = p0 + weight.size
             for i in range(nset):
@@ -523,7 +523,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             future = executor.submit(
                 _nr_rks_task,
                 ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
-                verbose=verbose, device_id=device_id, hermi=hermi)
+                verbose=log.verbose, device_id=device_id, hermi=hermi)
             futures.append(future)
     vmat_dist = []
     nelec_dist = []
@@ -552,7 +552,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     t0 = log.timer_debug1('nr_rks', *t0)
     return nelec, excsum, vmat
 
-def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ, 
+def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ,
                    non0tab=None, xctype='LDA', with_lapl=False,
                    verbose=None, out=None):
     groups = len(ao_group)
@@ -684,18 +684,18 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         rho_tot = cupy.empty([nset,5,ngrids])
     p0 = p1 = 0
     t1 = t0 = log.init_timer()
-    for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, 
+    for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
                                                  max_memory=max_memory):
         p1 = p0 + weight.size
         for i in range(nset):
             if mo_coeff is None:
                 rho_tot[i,:,p0:p1] = eval_rho(
-                    _sorted_mol, ao_mask, dms[i][idx[:,None],idx], 
+                    _sorted_mol, ao_mask, dms[i][idx[:,None],idx],
                     xctype=xctype, hermi=1)
             else:
                 assert hermi == 1
                 mo_coeff_mask = mo_coeff[idx,:]
-                rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, 
+                rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask,
                                                mo_occ, None, xctype)
         p0 = p1
         t1 = log.timer_debug2('eval rho slice', *t1)
@@ -794,19 +794,19 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
         if dms is not None:
-            dma, dmb = dms 
+            dma, dmb = dms
            dma = cupy.asarray(dma)
            dmb = cupy.asarray(dmb)
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)
-
-        log = logger.new_logger(mol)
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
         t0 = log.init_timer()
         xctype = ni._xc_type(xc_code)
         nao = mol.nao
         opt = ni.gdftopt
         _sorted_mol = opt._sorted_mol
-        
+
         nset = dma.shape[0]
         nelec = np.zeros((2,nset))
         excsum = np.zeros(nset)
@@ -824,7 +824,7 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         grid_end = (device_id + 1) * ngrids_per_device
 
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
-                                                     max_memory=None, 
+                                                     max_memory=None,
                                                      grid_range=(grid_start, grid_end)):
             for i in range(nset):
                 t0 = log.init_timer()
@@ -879,11 +879,13 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                 excsum[i] += cupy.dot(den_a, exc[:,0])
                 excsum[i] += cupy.dot(den_b, exc[:,0])
             t1 = log.timer_debug1('integration', *t1)
-    
+
     return nelec, excsum, (vmata, vmatb)
 
 def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
            max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     xctype = ni._xc_type(xc_code)
     opt = getattr(ni, 'gdftopt', None)
     if opt is None:
@@ -912,7 +914,7 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             future = executor.submit(
                 _nr_uks_task,
                 ni, mol, grids, xc_code, (dma,dmb), mo_coeff, mo_occ,
-                verbose=verbose, device_id=device_id, hermi=hermi)
+                verbose=log.verbose, device_id=device_id, hermi=hermi)
             futures.append(future)
 
     vmata_dist = []
@@ -925,12 +927,12 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         vmatb_dist.append(v[1])
         nelec_dist.append(n)
         excsum_dist.append(e)
-    
+
     vmata = reduce_to_device(vmata_dist, inplace=True)
     vmatb = reduce_to_device(vmatb_dist, inplace=True)
     vmata = opt.unsort_orbitals(vmata, axis=[1,2])
     vmatb = opt.unsort_orbitals(vmatb, axis=[1,2])
-    
+
     nelec = np.sum(nelec_dist, axis=0)
     excsum = np.sum(excsum_dist, axis=0)
 
@@ -949,6 +951,7 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         vmata = vmata[0]
         vmatb = vmatb[0]
     vmat = cupy.asarray([vmata, vmatb])
+    t0 = log.timer_debug1('nr_uks', *t0)
     return nelec, excsum, vmat
 
 
@@ -1001,7 +1004,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
         if mo1 is not None: mo1 = cupy.asarray(mo1)
         if occ_coeff is not None: occ_coeff = cupy.asarray(occ_coeff)
         if fxc is not None: fxc = cupy.asarray(fxc)
-        
+        assert isinstance(verbose, int)
         log = logger.new_logger(mol, verbose)
         xctype = ni._xc_type(xc_code)
         opt = getattr(ni, 'gdftopt', None)
@@ -1074,6 +1077,8 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
 
 def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0,
                rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     if fxc is None:
         raise RuntimeError('fxc was not initialized')
     xctype = ni._xc_type(xc_code)
@@ -1100,7 +1105,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
             future = executor.submit(
                 _nr_rks_fxc_task,
                 ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
-                verbose=verbose, hermi=hermi, device_id=device_id)
+                verbose=log.verbose, hermi=hermi, device_id=device_id)
             futures.append(future)
     vmat_dist = []
     for future in futures:
@@ -1116,7 +1121,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
 
     if len(dm_shape) == 2:
         vmat = vmat[0]
-    
+    t0 = log.timer_debug1('nr_rks_fxc', *t0)
     return cupy.asarray(vmat)
 
 def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None,
@@ -1281,7 +1286,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     nao, nao0 = opt.coeff.shape
     mol = None
     _sorted_mol = opt._sorted_mol
-    
+
     dms = dms.reshape(-1,nao0,nao0)
     assert len(dms) == 1
     dms = opt.sort_orbitals(dms, axis=[1,2])
@@ -1385,7 +1390,7 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
             t1 = log.timer_debug2('eval rho in fxc', *t1)
         #rho = (cupy.hstack(rhoa), cupy.hstack(rhob))
         rho = cupy.stack([cupy.hstack(rhoa), cupy.hstack(rhob)], axis=0)
-        t0 = log.timer_debug1('eval rho in fxc', *t0) 
+        t0 = log.timer_debug1('eval rho in fxc', *t0)
     vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
     t0 = log.timer_debug1('eval fxc', *t0)
     return rho, vxc, fxc
@@ -1512,7 +1517,7 @@ def _sparse_index(mol, coords, l_ctr_offsets):
     ao_loc = mol.ao_loc_nr()
     non0shl_idx = cupy.zeros(len(ao_loc)-1, dtype=np.int32)
     coords = cupy.asarray(coords)
-    
+
     libgdft.GDFTscreen_index(
         ctypes.cast(stream.ptr, ctypes.c_void_p),
         ctypes.cast(non0shl_idx.data.ptr, ctypes.c_void_p),
@@ -1564,7 +1569,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
         grids.build(with_non0tab=False, sort_grids=True)
     if nao is None:
         nao = mol.nao
-    
+
     if grid_range is None:
         grid_start, grid_end = 0, grids.coords.shape[0]
     else:
@@ -1573,7 +1578,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
 
     device_id = cupy.cuda.Device().id
     log.debug(f'{grid_start} - {grid_end} grids are calculated on Device {device_id}.')
-    
+
     comp = (deriv+1)*(deriv+2)*(deriv+3)//6
     if blksize is None:
         #cupy.get_default_memory_pool().free_all_blocks()
@@ -1602,7 +1607,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
         lookup_key = (device_id, block_id, blksize, ngrids)
         if lookup_key not in ni.non0ao_idx:
             ni.non0ao_idx[lookup_key] = _sparse_index(_sorted_mol, coords, opt.l_ctr_offsets)
-        
+
         pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[lookup_key]
         ao_mask = eval_ao(
             _sorted_mol, coords, deriv,
@@ -1613,7 +1618,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
             gdftopt=opt,
             transpose=False
         )
-        
+
         if pad > 0:
             if deriv == 0:
                 ao_mask[-pad:,:] = 0.0
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index 3321514a..e98bbd37 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -119,7 +119,7 @@ def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
             kern_counts += 1
     return ejk, kern_counts, timing_counter
 
-def _jk_energy_per_atom(mol, dm, vhfopt=None, 
+def _jk_energy_per_atom(mol, dm, vhfopt=None,
                         j_factor=1., k_factor=1., verbose=None):
     ''' Computes the first-order derivatives of the energy per atom for
     j_factor * J_derivatives - k_factor * K_derivatives
@@ -159,7 +159,7 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None,
             future = executor.submit(
                 _ejk_ip1_task,
                 mol, dms, vhfopt, task_list[device_id],
-                j_factor=j_factor, k_factor=k_factor, verbose=verbose,
+                j_factor=j_factor, k_factor=k_factor, verbose=log.verbose,
                 device_id=device_id)
             futures.append(future)
 
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index e92e7352..fb8154dc 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -145,7 +145,7 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)
 
-        log = logger.new_logger(mol)
+        log = logger.new_logger(mol, verbose)
         t0 = log.init_timer()
         xctype = ni._xc_type(xc_code)
         nao = mol.nao
@@ -156,13 +156,13 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
         grid_start = device_id * ngrids_per_device
         grid_end = (device_id + 1) * ngrids_per_device
-        
+
         nset = len(dms)
         assert nset == 1
         vmat = cupy.zeros((nset,3,nao,nao))
         if xctype == 'LDA':
             ao_deriv = 1
-            for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None, 
+            for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None,
                                                          grid_range=(grid_start, grid_end)):
                 for idm in range(nset):
                     mo_coeff_mask = mo_coeff[idx,:]
@@ -206,11 +206,13 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
 
 def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     opt = getattr(ni, 'gdftopt', None)
     if opt is None:
         ni.build(mol, grids.coords)
         opt = ni.gdftopt
-    
+
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
     nao = mol.nao
@@ -226,7 +228,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             future = executor.submit(
                 _get_vxc_task,
                 ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
-                verbose=verbose, device_id=device_id)
+                verbose=log.verbose, device_id=device_id)
             futures.append(future)
     vmat_dist = [future.result() for future in futures]
     vmat = reduce_to_device(vmat_dist)
@@ -234,12 +236,14 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     exc = None
     if nset == 1:
         vmat = vmat[0]
-    
+    log.timer_debug1('grad vxc', *t0)
     # - sign because nabla_X = -nabla_x
     return exc, -vmat
 
 def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     xctype = ni._xc_type(xc_code)
     opt = getattr(ni, 'gdftopt', None)
     if opt is None:
@@ -249,7 +253,6 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
 
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
-    mol = None
     _sorted_mol = opt._sorted_mol
     coeff = cupy.asarray(opt.coeff)
     nao, nao0 = coeff.shape
@@ -290,6 +293,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     vmat = opt.unsort_orbitals(vmat, axis=[1,2])
     exc = None
     # - sign because nabla_X = -nabla_x
+    log.timer_debug1('grad nlc vxc', *t0)
     return exc, -vmat
 
 def _make_dR_dao_w(ao, wv):
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 4fbe6703..26929dd3 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -165,12 +165,13 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                   device_id=0, verbose=0):
     n_dm = dms.shape[0]
     assert n_dm <= 2
+    assert isinstance(verbose, int)
     nao, _ = vhfopt.coeff.shape
     uniq_l_ctr = vhfopt.uniq_l_ctr
     uniq_l = uniq_l_ctr[:,0]
     l_ctr_bas_loc = vhfopt.l_ctr_offsets
     l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
-    
+
     kern1 = libvhf_rys.RYS_per_atom_jk_ip2_type12
     kern2 = libvhf_rys.RYS_per_atom_jk_ip2_type3
 
@@ -299,7 +300,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
             future = executor.submit(
                 _ejk_ip2_task,
                 mol, dms, vhfopt, task_list[device_id],
-                j_factor=j_factor, k_factor=k_factor, verbose=verbose,
+                j_factor=j_factor, k_factor=k_factor, verbose=log.verbose,
                 device_id=device_id)
             futures.append(future)
 
@@ -318,7 +319,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
             log.debug1('%s wall time %.2f', llll, t)
 
     ejk = reduce_to_device(ejk_dist, inplace=True)
-    
+
     timing_collection = {}
     kern_counts = 0
 
@@ -379,6 +380,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
 
 def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
                        device_id=0, with_j=True, with_k=True, verbose=0):
+    assert isinstance(verbose, int)
     nao, _ = vhfopt.coeff.shape
     natm = mol.natm
     nbas = mol.nbas
@@ -418,7 +420,7 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
         tril_tile_mappings = _make_tril_tile_mappings(
             l_ctr_bas_loc, vhfopt.tile_q_cond, log_cutoff-log_max_dm, 1)
         workers = gpu_specs['multiProcessorCount']
-        QUEUE_DEPTH = 65536 
+        QUEUE_DEPTH = 65536
         pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
@@ -504,14 +506,14 @@ def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
 
     atom0, atom1 = atoms_slice
     init_constant(mol)
-    
+
     uniq_l_ctr = vhfopt.uniq_l_ctr
     uniq_l = uniq_l_ctr[:,0]
     assert uniq_l.max() <= LMAX
 
     nbas = mol.nbas
     assert vhfopt.tile_q_cond.shape == (nbas, nbas)
-    
+
     n_groups = len(uniq_l_ctr)
     tasks = [(i,j) for i in range(n_groups) for j in range(n_groups)]
     tasks = np.array(tasks)
@@ -526,7 +528,7 @@ def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
             future = executor.submit(
                 _build_jk_ip1_task,
                 mol, dms, vhfopt, task_list[device_id], atoms_slice,
-                with_j=with_j, with_k=with_k, verbose=verbose,
+                with_j=with_j, with_k=with_k, verbose=log.verbose,
                 device_id=device_id)
             futures.append(future)
 
@@ -831,7 +833,7 @@ def _e_hcore_generator(hessobj, dm):
     h1aa, h1ab = hessobj.get_hcore(mol)
     h1aa = cupy.asarray(h1aa)
     h1ab = cupy.asarray(h1ab)
-    
+
     hcore = cupy.empty((3,3,nao,nao))
     t1 = log.timer_debug1('get_hcore', *t1)
     def get_hcore(iatm, jatm):
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index f34924d4..4c9b5a0d 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -329,13 +329,13 @@ def _d1d2_dot_(vmat, mol, ao1, ao2, mask, ao_loc, dR1_on_bra=True):
                          shls_slice, ao_loc)
     #vmat += contract('yig,xjg->xyij', ao1, ao2)
 
-def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id=0):
+def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id=0, verbose=0):
     mol = hessobj.mol
     mf = hessobj.base
     ni = mf._numint
     nao = mol.nao
     opt = ni.gdftopt
-    
+
     _sorted_mol = opt._sorted_mol
     xctype = ni._xc_type(mf.xc)
     aoslices = mol.aoslice_by_atom()
@@ -346,16 +346,15 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
     grid_start = device_id * ngrids_per_device
     grid_end = (device_id + 1) * ngrids_per_device
-    
+
     with cupy.cuda.Device(device_id), _streams[device_id]:
-        log = logger.new_logger(mol, mol.verbose)
+        log = logger.new_logger(mol, verbose)
+        t1 = t0 = log.init_timer()
         mo_occ = cupy.asarray(mo_occ)
         mo_coeff = cupy.asarray(mo_coeff)
         dm0 = mf.make_rdm1(mo_coeff, mo_occ)
         dm0_sorted = opt.sort_orbitals(dm0, axis=[0,1])
         coeff = cupy.asarray(opt.coeff)
-        log = logger.new_logger(mol, mol.verbose)
-        t1 = t0 = log.init_timer()
         vmat_dm = cupy.zeros((_sorted_mol.natm,3,3,nao))
         ipip = cupy.zeros((3,3,nao,nao))
         if xctype == 'LDA':
@@ -389,7 +388,7 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
                 vmat_dm[ia][:,:,mask] += contract('yjg,xjg->xyj', ao_mask[1:4], aow)
             ao_dm0 = aow = None
             t1 = log.timer_debug2('integration', *t1)
-        
+
         vmat_dm = opt.unsort_orbitals(vmat_dm, axis=[3])
         for ia in range(_sorted_mol.natm):
             p0, p1 = aoslices[ia][2:]
@@ -528,7 +527,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
             future = executor.submit(
                 _get_vxc_deriv2_task,
                 hessobj, grids, mo_coeff, mo_occ, max_memory,
-                device_id=device_id)
+                device_id=device_id, verbose=mol.verbose)
             futures.append(future)
     vmat_dm_dist = [future.result() for future in futures]
     vmat_dm = reduce_to_device(vmat_dm_dist, inplace=True)
@@ -540,7 +539,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ni = mf._numint
     nao = mol.nao
     opt = ni.gdftopt
-    
+
     _sorted_mol = opt._sorted_mol
     xctype = ni._xc_type(mf.xc)
     aoslices = mol.aoslice_by_atom()
@@ -551,7 +550,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
     grid_start = device_id * ngrids_per_device
     grid_end = (device_id + 1) * ngrids_per_device
-    
+
     with cupy.cuda.Device(device_id), _streams[device_id]:
         mo_occ = cupy.asarray(mo_occ)
         mo_coeff = cupy.asarray(mo_coeff)
@@ -567,7 +566,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
         t1 = t0 = log.init_timer()
         if xctype == 'LDA':
             ao_deriv = 1
-            for ao, mask, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None, 
+            for ao, mask, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None,
                                                      grid_range=(grid_start, grid_end)):
                 ao = contract('nip,ij->njp', ao, coeff[mask])
                 rho = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff, mo_occ, mask, xctype)
@@ -645,7 +644,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
                 mow = [numint._scale_ao(mo[:4], wv[i,:4]) for i in range(3)]
                 vmat[ia] += rks_grad._d1_dot_(aow, mo[0].T)
                 vmat[ia] += rks_grad._d1_dot_(mow, ao[0].T).transpose([0,2,1])
-                
+
                 for j in range(1, 4):
                     aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)]
                     mow = [numint._scale_ao(mo[j], wv[i,4]) for i in range(3)]
diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
index 1af3e63a..29c58926 100644
--- a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
+++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
@@ -5036,6 +5036,9 @@ int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
         blocks.x = (bounds->npairs_ij + 255) / 256;
         blocks.y = (bounds->npairs_kl + 127) / 128;
         md_j_1_2<<<blocks, threads, 7424*sizeof(double)>>>(*envs, *jk, *bounds); break;
+    // TODO: dynamically select kernels based on max shared memory size
+    // >64KB shared memory is needed for the kernels below
+    /*
     case 18: // lij=2, lkl=0
         blocks.x = (bounds->npairs_ij + 255) / 256;
         blocks.y = (bounds->npairs_kl + 255) / 256;
@@ -5060,6 +5063,7 @@ int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
         blocks.x = (bounds->npairs_ij + 63) / 64;
         blocks.y = (bounds->npairs_kl + 255) / 256;
         md_j_4_0<<<blocks, threads, 7808*sizeof(double)>>>(*envs, *jk, *bounds); break;
+    */
     default: return 0;
     }
     return 1;
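
Note (not part of the patch): the df_jk.py, numint.py, grad/*.py and hessian/*.py hunks all use the same per-device dispatch pattern: one worker thread per GPU, each thread pinning its device before doing its share of the work, and the caller collecting the partial results. A minimal, self-contained sketch of that pattern, assuming only cupy and the standard library (the function names below are illustrative, not the gpu4pyscf API; the in-tree code additionally uses per-device streams and reduce_to_device):

    from concurrent.futures import ThreadPoolExecutor
    import numpy as np
    import cupy

    _num_devices = cupy.cuda.runtime.getDeviceCount()

    def _partial_sum_task(data, device_id=0):
        # Pin this worker thread to one GPU; arrays allocated below live there.
        with cupy.cuda.Device(device_id):
            chunk = cupy.asarray(data[device_id::_num_devices])
            # Return a host-side float so the caller need not care which
            # device the partial result lives on.
            return float(chunk.sum())

    def parallel_sum(data):
        futures = []
        with ThreadPoolExecutor(max_workers=_num_devices) as executor:
            for device_id in range(_num_devices):
                futures.append(executor.submit(_partial_sum_task, data,
                                               device_id=device_id))
        return sum(f.result() for f in futures)

    if __name__ == '__main__':
        print(parallel_sum(np.arange(10.0)))   # 45.0 for any device count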
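
Note (not part of the patch): several tasks gain `assert isinstance(verbose, int)` while their callers switch from `verbose=verbose` to `verbose=log.verbose`. The point is that a possibly-None verbose is resolved to an integer once, in the caller, so every per-device worker logs at the same level instead of silently falling back to a default inside the thread. A small sketch of the convention using pyscf's logger (the driver/worker split is illustrative, not code from this patch):

    from pyscf import gto
    from pyscf.lib import logger

    def driver(mol, verbose=None):
        # None is resolved here: new_logger falls back to mol.verbose.
        log = logger.new_logger(mol, verbose)
        return [worker(mol, verbose=log.verbose, device_id=i) for i in range(2)]

    def worker(mol, verbose=0, device_id=0):
        assert isinstance(verbose, int)      # workers never receive None
        log = logger.new_logger(mol, verbose)
        log.debug1('task running on device %d', device_id)
        return device_id

    mol = gto.M(atom='H 0 0 0; H 0 0 0.74', basis='sto-3g', verbose=4)
    driver(mol)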
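
Note (not part of the patch): the 'cd' branch of `_cderi_task` (whose unknown-tag fallback now raises RuntimeError instead of silently discarding it) builds each CDERI block by a triangular solve against the Cholesky factor of the auxiliary 2c2e metric. The underlying linear algebra, reduced to a CPU NumPy/SciPy sketch with made-up shapes:

    import numpy as np
    from scipy.linalg import cholesky, solve_triangular

    naux, npair = 6, 10
    rng = np.random.default_rng(0)
    a = rng.standard_normal((naux, naux))
    j2c = a @ a.T + naux * np.eye(naux)          # SPD stand-in for (P|Q)
    int3c = rng.standard_normal((naux, npair))   # stand-in for (P|ij)

    low = cholesky(j2c, lower=True)              # j2c = low @ low.T
    cderi = solve_triangular(low, int3c, lower=True)

    # Contracting the factors reproduces the metric-weighted integrals,
    # i.e. cderi.T @ cderi == (P|ij).T j2c^{-1} (P|ij).
    assert np.allclose(cderi.T @ cderi, int3c.T @ np.linalg.solve(j2c, int3c))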
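
Note (not part of the patch): the grid slicing repeated in `_nr_rks_task`, `_nr_uks_task` and `_get_vxc_task` is plain ceil division; the end of the last slice can overshoot the grid size, which is harmless as long as it is only used for slicing. A tiny worked example with made-up numbers (the min() here only makes the clamping explicit):

    ngrids_glob = 1003
    _num_devices = 4
    ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices   # 251

    slices = []
    for device_id in range(_num_devices):
        grid_start = device_id * ngrids_per_device
        grid_end = (device_id + 1) * ngrids_per_device
        slices.append((grid_start, min(grid_end, ngrids_glob)))

    print(slices)   # [(0, 251), (251, 502), (502, 753), (753, 1003)]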