diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index f1128fc5..4eb534e3 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -13,10 +13,35 @@ permissions:
   contents: read
 
 jobs:
-  build:
-
-    runs-on: self-hosted
+  single-gpu:
+    runs-on: [self-hosted, Linux, X64, v100]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Install dependencies
+      run: |
+        pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+        python3 -m pip install --upgrade pip
+        pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion
+        pip3 install pyscf --upgrade
+        pip3 install git+https://github.com/pyscf/properties --upgrade
+        pip3 install numpy --upgrade
+        pip3 install h5py --upgrade
+        pip3 install gpu4pyscf-libxc-cuda12x --upgrade
+        pip3 install cupy-cuda12x --upgrade
+        git config --global core.compression 9
+    - name: Build GPU4PySCF
+      run: |
+        export CUDA_HOME=/usr/local/cuda
+        export CMAKE_CONFIGURE_ARGS="-DBUILD_LIBXC=OFF -DCUDA_ARCHITECTURES=70-real -DBUILD_CUTLASS=ON"
+        sh build.sh
+    - name: Test with pytest
+      run: |
+        echo $GITHUB_WORKSPACE
+        export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+        pytest -m "not smoke" --cov=$GITHUB_WORKSPACE
 
+  multi-gpu:
+    runs-on: [self-hosted, Linux, X64, 2T4]
     steps:
     - uses: actions/checkout@v3
     - name: Install dependencies
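The single-gpu job deselects smoke tests with `-m "not smoke"`. For that expression to select anything, tests must carry a registered `smoke` marker; a minimal sketch of the pytest side, assuming an ini-style registration (the marker description and test body are illustrative, not taken from the repo):

import pytest

# registration, e.g. in pytest.ini:
#   [pytest]
#   markers =
#       smoke: fast sanity checks run outside the full unit-test job

@pytest.mark.smoke
def test_gpu_visible():
    import cupy
    assert cupy.cuda.runtime.getDeviceCount() >= 1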
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index 872268d7..e5185498 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -86,7 +86,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
         j2c = cupy.asarray(j2c_cpu, order='C')
         t0 = log.timer_debug1('2c2e', *t0)
         intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
-        intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, 
+        intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True,
                      group_size=GROUP_SIZE, group_size_aux=GROUP_SIZE)
         log.timer_debug1('prepare intopt', *t0)
         self.j2c = j2c.copy()
@@ -105,7 +105,7 @@ def build(self, direct_scf_tol=1e-14, omega=None):
         naux = self.naux = self.cd_low.shape[1]
         log.debug('size of aux basis %d', naux)
 
-        self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low, 
+        self._cderi = cholesky_eri_gpu(intopt, mol, auxmol, self.cd_low,
                                        omega=omega, use_gpu_memory=self.use_gpu_memory)
         log.timer_debug1('cholesky_eri', *t0)
         self.intopt = intopt
@@ -144,8 +144,8 @@ def get_blksize(self, extra=0, nao=None):
         return blksize
 
     def loop(self, blksize=None, unpack=True):
-        ''' loop over cderi for the current device 
-            and unpack the CDERI in (Lij) format 
+        ''' loop over cderi for the current device
+            and unpack the CDERI in (Lij) format
         '''
         device_id = cupy.cuda.Device().id
         cderi_sparse = self._cderi[device_id]
@@ -177,10 +177,10 @@ def loop(self, blksize=None, unpack=True):
             yield buf2, buf.T
             if isinstance(cderi_sparse, np.ndarray):
                 cupy.cuda.Device().synchronize()
-            
+
             if buf_prefetch is not None:
                 buf = buf_prefetch
-            
+
     def reset(self, mol=None):
         '''Reset mol and clean up relevant attributes for scanner mode'''
         if mol is not None:
@@ -198,7 +198,7 @@ def reset(self, mol=None):
     get_ao_eri = get_eri = NotImplemented
     get_mo_eri = ao2mo = NotImplemented
 
-def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, 
+def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
                      omega=None, sr_only=False, use_gpu_memory=True):
     '''
     Returns:
@@ -210,13 +210,13 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
 
     # Available memory on Device 0.
     avail_mem = get_avail_mem()
-    
+
     if use_gpu_memory:
         # CDERI will be equally distributed to the devices
         # Other devices usually have more memory available than Device 0
         # CDERI will use up to 40% of the available memory
         use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
-    
+
     if use_gpu_memory:
         log.debug("Saving CDERI on GPU")
     else:
@@ -235,7 +235,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
             _cderi[device_id] = cderi_blk
 
     npairs_per_ctr = [len(intopt.ao_pairs_row[cp_ij_id]) for cp_ij_id in range(len(intopt.log_qs))]
-    
+
     npairs_per_ctr = np.array(npairs_per_ctr)
     total_task_list = np.argsort(npairs_per_ctr)
     task_list_per_device = []
@@ -253,13 +253,13 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
             future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi,
                                      omega=omega, sr_only=sr_only, device_id=device_id)
             futures.append(future)
-    
+
     for future in futures:
         future.result()
-    
+
     if not use_gpu_memory:
         cupy.cuda.Device().synchronize()
-    
+
     return _cderi
 
 def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0):
@@ -273,6 +273,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
     pairs_loc = np.append(0, np.cumsum(npairs))
     blksize = (naux + _num_devices - 1) // _num_devices
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(mol.verbose, int)
         log = logger.new_logger(mol, mol.verbose)
         t1 = log.init_timer()
         cd_low_tag = cd_low.tag
@@ -320,7 +321,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
 
             row = intopt.ao_pairs_row[cp_ij_id] - i0
             col = intopt.ao_pairs_col[cp_ij_id] - j0
-            
+
             ints_slices_f= cupy.empty([naoaux,len(row)], order='F')
             ints_slices_f[:] = ints_slices[:,col,row]
             ints_slices = None
@@ -330,12 +331,12 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             elif cd_low_tag == 'cd':
                 cderi_block = solve_triangular(cd_low, ints_slices_f, lower=True, overwrite_b=True)
             else:
-                RuntimeError('Tag is not found in lower triangular matrix.')
+                raise RuntimeError(f'Unknown tag {cd_low_tag} for the lower triangular matrix.')
             t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq} on Device {device_id}', *t1)
 
-            # TODO: 
+            # TODO:
             # 1) async data transfer
-            # 2) auxiliary basis in the last dimension 
+            # 2) auxiliary basis in the last dimension
 
             # if CDERI is saved on CPU
             ij0 = pairs_loc[cp_ij_id]
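The `use_gpu_memory` decision in `cholesky_eri_gpu` reduces to one size test: keep the CDERI on the GPUs only if the float64 tensor of shape (naux, npairs) fits within 40% of the free memory, scaled by the device count because the tensor is split evenly. A standalone sketch of that check (the function name and example figures are illustrative; the 8-byte element size and 0.4 factor mirror the hunk):

def cderi_fits_on_gpus(naux, npairs, avail_mem, num_devices, frac=0.4):
    """True if the float64 (naux, npairs) CDERI fits in `frac` of the
    free memory when split evenly across `num_devices` devices."""
    nbytes = naux * npairs * 8   # 8 bytes per float64 element
    return nbytes < frac * avail_mem * num_devices

For example, a 6 GB CDERI against 10 GB free passes only with two or more devices (6 < 0.4 * 10 * 2). The patch measures free memory on Device 0 and scales by the device count, leaning on the observation in the comment that the other devices usually have at least as much free.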
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
index e8c8e37a..984ea2e2 100644
--- a/gpu4pyscf/df/df_jk.py
+++ b/gpu4pyscf/df/df_jk.py
@@ -249,6 +249,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
     ''' Calculate J and K matrices on single GPU
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
         dms = cupy.asarray(dms)
@@ -313,6 +314,7 @@ def _jk_task_with_mo1(dfobj, dms, mo1s, occ_coeffs,
     '''
     vj = vk = None
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
         dms = cupy.asarray(dms)
@@ -373,6 +375,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
     ''' Calculate J and K matrices with density matrix
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
         dms = cupy.asarray(dms)
@@ -404,7 +407,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
                 for k in range(nset):
                     rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
                     #vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
-                    vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))            
+                    vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
         if with_j:
             vj = cupy.zeros(dms_shape)
             vj[:,rows,cols] = vj_sparse
@@ -437,7 +440,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
 
     assert nao == dfobj.nao
     intopt = dfobj.intopt
-    
+
     nao = dms_tag.shape[-1]
     dms = dms_tag.reshape([-1,nao,nao])
     intopt = dfobj.intopt
@@ -456,7 +459,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
         with ThreadPoolExecutor(max_workers=_num_devices) as executor:
             for device_id in range(_num_devices):
                 future = executor.submit(
-                    _jk_task_with_mo, 
+                    _jk_task_with_mo,
                     dfobj, dms, mo_coeff, mo_occ,
                     hermi=hermi, device_id=device_id,
                     with_j=with_j, with_k=with_k)
@@ -477,7 +480,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
         with ThreadPoolExecutor(max_workers=_num_devices) as executor:
             for device_id in range(_num_devices):
                 future = executor.submit(
-                    _jk_task_with_mo1, 
+                    _jk_task_with_mo1,
                     dfobj, dms, mo1s, occ_coeffs,
                     hermi=hermi, device_id=device_id,
                     with_j=with_j, with_k=with_k)
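Both branches of `get_jk` dispatch one worker thread per device, the pattern this patch standardizes across modules. Stripped to a skeleton (a sketch; `work` and `run_on_all_gpus` are placeholder names, not from the repo):

from concurrent.futures import ThreadPoolExecutor
import cupy

def work(device_id):
    # pin this thread to one GPU before allocating or launching anything
    with cupy.cuda.Device(device_id):
        return float(cupy.arange(10).sum())

def run_on_all_gpus(num_devices):
    with ThreadPoolExecutor(max_workers=num_devices) as executor:
        futures = [executor.submit(work, i) for i in range(num_devices)]
    # result() re-raises any exception raised inside a worker thread
    return [f.result() for f in futures]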
diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py
index a8039e54..95a5f021 100644
--- a/gpu4pyscf/df/grad/jk.py
+++ b/gpu4pyscf/df/grad/jk.py
@@ -25,6 +25,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
     rhoj = rhok = None
     with cupy.cuda.Device(device_id), _streams[device_id]:
+        assert isinstance(with_df.verbose, int)
         log = logger.new_logger(with_df.mol, with_df.verbose)
         t0 = log.init_timer()
         dm = cupy.asarray(dm)
         orbo = cupy.asarray(orbo)
@@ -34,7 +35,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
         cols = with_df.intopt.cderi_col
         dm_sparse = dm[rows, cols]
         dm_sparse[with_df.intopt.cderi_diag] *= .5
-        
+
         blksize = with_df.get_blksize()
         if with_j:
             rhoj = cupy.empty([naux_slice])
@@ -65,18 +66,18 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
                 _jk_task, with_df, dm, orbo,
                 with_j=with_j, with_k=with_k, device_id=device_id)
             futures.append(future)
-    
+
     rhoj_total = []
     rhok_total = []
     for future in futures:
         rhoj, rhok = future.result()
         rhoj_total.append(rhoj)
         rhok_total.append(rhok)
-        
+
     rhoj = rhok = None
     if with_j:
         rhoj = concatenate(rhoj_total)
     if with_k:
         rhok = concatenate(rhok_total)
-    
+
     return rhoj, rhok
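The `assert isinstance(..., int)` lines added throughout this patch guard one hazard: pyscf's `logger.new_logger(obj, verbose)` falls back to `obj.verbose` when `verbose` is None, which is unwanted inside a device worker thread. Drivers therefore resolve the verbosity once and hand the plain int to each worker, as in this minimal sketch (`driver` and `_worker` are hypothetical names):

from pyscf import gto
from pyscf.lib import logger

def _worker(mol, verbose, device_id=0):
    assert isinstance(verbose, int)        # already resolved by the driver
    log = logger.new_logger(mol, verbose)
    log.debug1('running on device %d', device_id)

def driver(mol, verbose=None):
    log = logger.new_logger(mol, verbose)  # may fall back to mol.verbose
    _worker(mol, log.verbose)              # workers receive the resolved int

driver(gto.M(atom='H 0 0 0; H 0 0 0.74', basis='sto-3g', verbose=6))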
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 5ff64fc2..2e5a8da4 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -91,7 +91,7 @@ def eval_ao(mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=
     coords = cupy.asarray(coords.T, order='C')
     comp = (deriv+1)*(deriv+2)*(deriv+3)//6
     stream = cupy.cuda.get_current_stream()
-    
+
     # ao must be set to zero due to implementation
     if deriv > 1:
         if out is None:
@@ -123,7 +123,7 @@ def eval_ao(mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=
 
     if transpose:
         out = out.transpose(0,2,1)
-    
+
     if deriv == 0:
         out = out[0]
     return out
@@ -397,7 +397,7 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars):
     vxc[1,threshind] = 1.5*W*dW0dG
     return exc,vxc
 
-def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, 
+def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                  verbose=None, with_lapl=False, grid_range=(), device_id=0, hermi=1):
     ''' nr_rks task on given device
     '''
@@ -405,8 +405,8 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         if dms is not None: dms = cupy.asarray(dms)
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)
-
-        log = logger.new_logger(mol)
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
         t0 = log.init_timer()
         xctype = ni._xc_type(xc_code)
         nao = mol.nao
@@ -417,13 +417,13 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
             ao_deriv = 0
         else:
             ao_deriv = 1
-        
+
         ngrids_glob = grids.coords.shape[0]
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
         grid_start = device_id * ngrids_per_device
         grid_end = (device_id + 1) * ngrids_per_device
         ngrids_local = grid_end - grid_start
-        
+
         weights = cupy.empty([ngrids_local])
         if xctype == 'LDA':
             rho_tot = cupy.empty([nset,1,ngrids_local])
@@ -434,22 +434,22 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
 
         p0 = p1 = 0
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
-                                                     max_memory=None, 
+                                                     max_memory=None,
                                                      grid_range=(grid_start, grid_end)):
             p1 = p0 + weight.size
             weights[p0:p1] = weight
             for i in range(nset):
                 if mo_coeff is None:
-                    rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], 
+                    rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx],
                                                 xctype=xctype, hermi=hermi, with_lapl=with_lapl)
                 else:
                     assert hermi == 1
                     mo_coeff_mask = mo_coeff[idx,:]
-                    rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ, 
+                    rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ,
                                                 None, xctype, with_lapl)
             p0 = p1
         t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0)
-        
+
         # libxc calls are still running on default stream
         nelec = cupy.zeros(nset)
         excsum = cupy.zeros(nset)
@@ -471,7 +471,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         vmat = cupy.zeros((nset, nao, nao))
         p0 = p1 = 0
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
-                                                     max_memory=None, 
+                                                     max_memory=None,
                                                      grid_range=(grid_start, grid_end)):
             p1 = p0 + weight.size
             for i in range(nset):
@@ -523,7 +523,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             future = executor.submit(
                 _nr_rks_task,
                 ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
-                verbose=verbose, device_id=device_id, hermi=hermi)
+                verbose=log.verbose, device_id=device_id, hermi=hermi)
             futures.append(future)
     vmat_dist = []
     nelec_dist = []
@@ -552,7 +552,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     t0 = log.timer_debug1('nr_rks', *t0)
     return nelec, excsum, vmat
 
-def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ, 
+def eval_rho_group(mol, ao_group, mo_coeff_group, mo_occ,
                    non0tab=None, xctype='LDA',
                    with_lapl=False, verbose=None, out=None):
     groups = len(ao_group)
@@ -684,18 +684,18 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         rho_tot = cupy.empty([nset,5,ngrids])
     p0 = p1 = 0
     t1 = t0 = log.init_timer()
-    for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, 
+    for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
                                                  max_memory=max_memory):
         p1 = p0 + weight.size
         for i in range(nset):
             if mo_coeff is None:
                 rho_tot[i,:,p0:p1] = eval_rho(
-                    _sorted_mol, ao_mask, dms[i][idx[:,None],idx], 
+                    _sorted_mol, ao_mask, dms[i][idx[:,None],idx],
                     xctype=xctype, hermi=1)
             else:
                 assert hermi == 1
                 mo_coeff_mask = mo_coeff[idx,:]
-                rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, 
+                rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask,
                                                mo_occ, None, xctype)
         p0 = p1
         t1 = log.timer_debug2('eval rho slice', *t1)
@@ -794,19 +794,19 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
         if dms is not None:
-            dma, dmb = dms 
+            dma, dmb = dms
             dma = cupy.asarray(dma)
             dmb = cupy.asarray(dmb)
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)
-        
-        log = logger.new_logger(mol)
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
         t0 = log.init_timer()
         xctype = ni._xc_type(xc_code)
         nao = mol.nao
         opt = ni.gdftopt
         _sorted_mol = opt._sorted_mol
-        
+
         nset = dma.shape[0]
         nelec = np.zeros((2,nset))
         excsum = np.zeros(nset)
@@ -824,7 +824,7 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         grid_end = (device_id + 1) * ngrids_per_device
 
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
-                                                     max_memory=None, 
+                                                     max_memory=None,
                                                      grid_range=(grid_start, grid_end)):
             for i in range(nset):
                 t0 = log.init_timer()
@@ -879,11 +879,13 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                 excsum[i] += cupy.dot(den_a, exc[:,0])
                 excsum[i] += cupy.dot(den_b, exc[:,0])
                 t1 = log.timer_debug1('integration', *t1)
-    
+
     return nelec, excsum, (vmata, vmatb)
 
 def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
            max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     xctype = ni._xc_type(xc_code)
     opt = getattr(ni, 'gdftopt', None)
     if opt is None:
@@ -912,7 +914,7 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             future = executor.submit(
                 _nr_uks_task,
                 ni, mol, grids, xc_code, (dma,dmb), mo_coeff, mo_occ,
-                verbose=verbose, device_id=device_id, hermi=hermi)
+                verbose=log.verbose, device_id=device_id, hermi=hermi)
             futures.append(future)
 
     vmata_dist = []
@@ -925,12 +927,12 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         vmatb_dist.append(v[1])
         nelec_dist.append(n)
         excsum_dist.append(e)
-        
+
     vmata = reduce_to_device(vmata_dist, inplace=True)
     vmatb = reduce_to_device(vmatb_dist, inplace=True)
     vmata = opt.unsort_orbitals(vmata, axis=[1,2])
     vmatb = opt.unsort_orbitals(vmatb, axis=[1,2])
-    
+
     nelec = np.sum(nelec_dist, axis=0)
     excsum = np.sum(excsum_dist, axis=0)
 
@@ -949,6 +951,7 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         vmata = vmata[0]
         vmatb = vmatb[0]
     vmat = cupy.asarray([vmata, vmatb])
+    t0 = log.timer_debug1('nr_uks', *t0)
     return nelec, excsum, vmat
 
 
@@ -1001,7 +1004,7 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
         if mo1 is not None: mo1 = cupy.asarray(mo1)
         if occ_coeff is not None: occ_coeff = cupy.asarray(occ_coeff)
         if fxc is not None: fxc = cupy.asarray(fxc)
-
+        assert isinstance(verbose, int)
         log = logger.new_logger(mol, verbose)
         xctype = ni._xc_type(xc_code)
         opt = getattr(ni, 'gdftopt', None)
@@ -1074,6 +1077,8 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
 
 def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0,
                rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     if fxc is None:
         raise RuntimeError('fxc was not initialized')
     xctype = ni._xc_type(xc_code)
@@ -1100,7 +1105,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
             future = executor.submit(
                 _nr_rks_fxc_task,
                 ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
-                verbose=verbose, hermi=hermi, device_id=device_id)
+                verbose=log.verbose, hermi=hermi, device_id=device_id)
             futures.append(future)
     vmat_dist = []
     for future in futures:
@@ -1116,7 +1121,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
 
     if len(dm_shape) == 2:
         vmat = vmat[0]
-
+    t0 = log.timer_debug1('nr_rks_fxc', *t0)
     return cupy.asarray(vmat)
 
 def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None,
@@ -1281,7 +1286,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     nao, nao0 = opt.coeff.shape
     mol = None
     _sorted_mol = opt._sorted_mol
-    
+
     dms = dms.reshape(-1,nao0,nao0)
     assert len(dms) == 1
     dms = opt.sort_orbitals(dms, axis=[1,2])
@@ -1385,7 +1390,7 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
             t1 = log.timer_debug2('eval rho in fxc', *t1)
         #rho = (cupy.hstack(rhoa), cupy.hstack(rhob))
         rho = cupy.stack([cupy.hstack(rhoa), cupy.hstack(rhob)], axis=0)
-        t0 = log.timer_debug1('eval rho in fxc', *t0)    
+        t0 = log.timer_debug1('eval rho in fxc', *t0)
     vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
     t0 = log.timer_debug1('eval fxc', *t0)
     return rho, vxc, fxc
@@ -1512,7 +1517,7 @@ def _sparse_index(mol, coords, l_ctr_offsets):
     ao_loc = mol.ao_loc_nr()
     non0shl_idx = cupy.zeros(len(ao_loc)-1, dtype=np.int32)
     coords = cupy.asarray(coords)
-    
+
     libgdft.GDFTscreen_index(
         ctypes.cast(stream.ptr, ctypes.c_void_p),
         ctypes.cast(non0shl_idx.data.ptr, ctypes.c_void_p),
@@ -1564,7 +1569,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
         grids.build(with_non0tab=False, sort_grids=True)
     if nao is None:
         nao = mol.nao
-    
+
     if grid_range is None:
         grid_start, grid_end = 0, grids.coords.shape[0]
     else:
@@ -1573,7 +1578,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
 
     device_id = cupy.cuda.Device().id
     log.debug(f'{grid_start} - {grid_end} grids are calculated on Device {device_id}.')
-    
+
     comp = (deriv+1)*(deriv+2)*(deriv+3)//6
     if blksize is None:
         #cupy.get_default_memory_pool().free_all_blocks()
@@ -1602,7 +1607,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
             lookup_key = (device_id, block_id, blksize, ngrids)
             if lookup_key not in ni.non0ao_idx:
                 ni.non0ao_idx[lookup_key] = _sparse_index(_sorted_mol, coords, opt.l_ctr_offsets)
-            
+
             pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[lookup_key]
             ao_mask = eval_ao(
                 _sorted_mol, coords, deriv,
@@ -1613,7 +1618,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
                 gdftopt=opt,
                 transpose=False
             )
-            
+
             if pad > 0:
                 if deriv == 0:
                     ao_mask[-pad:,:] = 0.0
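Each `_nr_*_task` above carves up the global grid the same way: ceil-divide by the device count, then take a contiguous window per device id. Isolated into a helper (the name and the `min` clamp on the final window are additions for clarity; the tasks themselves rely on `block_loop` to stop at the last grid point):

def grid_window(ngrids_glob, num_devices, device_id):
    ngrids_per_device = (ngrids_glob + num_devices - 1) // num_devices
    grid_start = device_id * ngrids_per_device
    grid_end = min(grid_start + ngrids_per_device, ngrids_glob)
    return grid_start, grid_end

# 10 grid points over 4 devices -> (0, 3), (3, 6), (6, 9), (9, 10)
assert grid_window(10, 4, 3) == (9, 10)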
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
index 3321514a..e98bbd37 100644
--- a/gpu4pyscf/grad/rhf.py
+++ b/gpu4pyscf/grad/rhf.py
@@ -119,7 +119,7 @@ def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                         kern_counts += 1
     return ejk, kern_counts, timing_counter
 
-def _jk_energy_per_atom(mol, dm, vhfopt=None, 
+def _jk_energy_per_atom(mol, dm, vhfopt=None,
                         j_factor=1., k_factor=1., verbose=None):
     ''' Computes the first-order derivatives of the energy per atom for
         j_factor * J_derivatives - k_factor * K_derivatives
@@ -159,7 +159,7 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None,
             future = executor.submit(
                 _ejk_ip1_task,
                 mol, dms, vhfopt, task_list[device_id],
-                j_factor=j_factor, k_factor=k_factor, verbose=verbose, 
+                j_factor=j_factor, k_factor=k_factor, verbose=log.verbose,
                 device_id=device_id)
             futures.append(future)
 
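`_jk_energy_per_atom` hands each worker a precomputed `task_list[device_id]`. Elsewhere in the patch (see `cholesky_eri_gpu`) such lists start from an `argsort` of the per-task sizes; a plausible dealing, sketched under the assumption of a round-robin over the sorted order (the exact split is outside the hunks shown):

import numpy as np

def split_tasks(task_sizes, num_devices):
    """Deal tasks round-robin in ascending-size order so every device
    receives a comparable total workload."""
    order = np.argsort(task_sizes)
    return [order[device_id::num_devices] for device_id in range(num_devices)]

# sizes [5, 1, 4, 2, 3] on 2 devices -> [array([1, 4, 0]), array([3, 2])]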
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index e92e7352..fb8154dc 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -145,7 +145,8 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)
 
-        log = logger.new_logger(mol)
+        assert isinstance(verbose, int)
+        log = logger.new_logger(mol, verbose)
         t0 = log.init_timer()
         xctype = ni._xc_type(xc_code)
         nao = mol.nao
@@ -156,13 +156,13 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
         grid_start = device_id * ngrids_per_device
         grid_end = (device_id + 1) * ngrids_per_device
-        
+
         nset = len(dms)
         assert nset == 1
         vmat = cupy.zeros((nset,3,nao,nao))
         if xctype == 'LDA':
             ao_deriv = 1
-            for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None, 
+            for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None,
                                                          grid_range=(grid_start, grid_end)):
                 for idm in range(nset):
                     mo_coeff_mask = mo_coeff[idx,:]
@@ -206,11 +206,13 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
 
 def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     opt = getattr(ni, 'gdftopt', None)
     if opt is None:
         ni.build(mol, grids.coords)
         opt = ni.gdftopt
-    
+
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
     nao = mol.nao
@@ -226,7 +228,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             future = executor.submit(
                 _get_vxc_task,
                 ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
-                verbose=verbose, device_id=device_id)
+                verbose=log.verbose, device_id=device_id)
             futures.append(future)
     vmat_dist = [future.result() for future in futures]
     vmat = reduce_to_device(vmat_dist)
@@ -234,12 +236,14 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     exc = None
     if nset == 1:
         vmat = vmat[0]
-
+    log.timer_debug1('grad vxc', *t0)
     # - sign because nabla_X = -nabla_x
     return exc, -vmat
 
 def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 max_memory=2000, verbose=None):
+    log = logger.new_logger(mol, verbose)
+    t0 = log.init_timer()
     xctype = ni._xc_type(xc_code)
     opt = getattr(ni, 'gdftopt', None)
     if opt is None:
@@ -249,7 +253,6 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
 
-    mol = None
     _sorted_mol = opt._sorted_mol
     coeff = cupy.asarray(opt.coeff)
     nao, nao0 = coeff.shape
@@ -290,6 +293,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     vmat = opt.unsort_orbitals(vmat, axis=[1,2])
     exc = None
     # - sign because nabla_X = -nabla_x
+    log.timer_debug1('grad nlc vxc', *t0)
     return exc, -vmat
 
 def _make_dR_dao_w(ao, wv):
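The timers added to `get_vxc` and `get_nlc_vxc` follow the standard idiom of gpu4pyscf's logger (an extension of pyscf's): `init_timer()` returns a tuple of timestamps and `timer_debug1(msg, *t0)` logs the elapsed time when verbosity reaches DEBUG1, returning fresh stamps for the next span. In isolation:

from pyscf import gto
from gpu4pyscf.lib import logger

mol = gto.M(atom='He 0 0 0', basis='sto-3g', verbose=6)  # 6 == DEBUG1
log = logger.new_logger(mol)
t0 = log.init_timer()
# ... timed work goes here ...
t0 = log.timer_debug1('timed section', *t0)  # logs and returns new stamps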
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index 4fbe6703..26929dd3 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -165,12 +165,13 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                   device_id=0, verbose=0):
     n_dm = dms.shape[0]
     assert n_dm <= 2
+    assert isinstance(verbose, int)
     nao, _ = vhfopt.coeff.shape
     uniq_l_ctr = vhfopt.uniq_l_ctr
     uniq_l = uniq_l_ctr[:,0]
     l_ctr_bas_loc = vhfopt.l_ctr_offsets
     l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
-    
+
     kern1 = libvhf_rys.RYS_per_atom_jk_ip2_type12
     kern2 = libvhf_rys.RYS_per_atom_jk_ip2_type3
 
@@ -299,7 +300,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
             future = executor.submit(
                 _ejk_ip2_task,
                 mol, dms, vhfopt, task_list[device_id],
-                j_factor=j_factor, k_factor=k_factor, verbose=verbose, 
+                j_factor=j_factor, k_factor=k_factor, verbose=log.verbose,
                 device_id=device_id)
             futures.append(future)
 
@@ -318,7 +319,7 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non
             log.debug1('%s wall time %.2f', llll, t)
 
     ejk = reduce_to_device(ejk_dist, inplace=True)
-    
+
     timing_collection = {}
     kern_counts = 0
 
@@ -379,6 +380,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
 
 def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
                        device_id=0, with_j=True, with_k=True, verbose=0):
+    assert isinstance(verbose, int)
     nao, _ = vhfopt.coeff.shape
     natm = mol.natm
     nbas = mol.nbas
@@ -418,7 +420,7 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice,
         tril_tile_mappings = _make_tril_tile_mappings(
             l_ctr_bas_loc, vhfopt.tile_q_cond, log_cutoff-log_max_dm, 1)
         workers = gpu_specs['multiProcessorCount']
-        QUEUE_DEPTH = 65536 
+        QUEUE_DEPTH = 65536
         pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
@@ -504,14 +506,14 @@ def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
     atom0, atom1 = atoms_slice
 
     init_constant(mol)
- 
+
     uniq_l_ctr = vhfopt.uniq_l_ctr
     uniq_l = uniq_l_ctr[:,0]
     assert uniq_l.max() <= LMAX
 
     nbas = mol.nbas
     assert vhfopt.tile_q_cond.shape == (nbas, nbas)
-    
+
     n_groups = len(uniq_l_ctr)
     tasks = [(i,j) for i in range(n_groups) for j in range(n_groups)]
     tasks = np.array(tasks)
@@ -526,7 +528,7 @@ def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None):
             future = executor.submit(
                 _build_jk_ip1_task,
                 mol, dms, vhfopt, task_list[device_id], atoms_slice,
-                with_j=with_j, with_k=with_k, verbose=verbose, 
+                with_j=with_j, with_k=with_k, verbose=log.verbose,
                 device_id=device_id)
             futures.append(future)
 
@@ -831,7 +833,7 @@ def _e_hcore_generator(hessobj, dm):
     h1aa, h1ab = hessobj.get_hcore(mol)
     h1aa = cupy.asarray(h1aa)
     h1ab = cupy.asarray(h1ab)
-    
+
     hcore = cupy.empty((3,3,nao,nao))
     t1 = log.timer_debug1('get_hcore', *t1)
     def get_hcore(iatm, jatm):
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index f34924d4..4c9b5a0d 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -329,13 +329,13 @@ def _d1d2_dot_(vmat, mol, ao1, ao2, mask, ao_loc, dR1_on_bra=True):
                                                  shls_slice, ao_loc)
         #vmat += contract('yig,xjg->xyij', ao1, ao2)
 
-def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id=0):
+def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id=0, verbose=0):
     mol = hessobj.mol
     mf = hessobj.base
     ni = mf._numint
     nao = mol.nao
     opt = ni.gdftopt
-    
+
     _sorted_mol = opt._sorted_mol
     xctype = ni._xc_type(mf.xc)
     aoslices = mol.aoslice_by_atom()
@@ -346,16 +346,15 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
     grid_start = device_id * ngrids_per_device
     grid_end = (device_id + 1) * ngrids_per_device
-    
+
     with cupy.cuda.Device(device_id), _streams[device_id]:
-        log = logger.new_logger(mol, mol.verbose)
+        log = logger.new_logger(mol, verbose)
+        t1 = t0 = log.init_timer()
         mo_occ = cupy.asarray(mo_occ)
         mo_coeff = cupy.asarray(mo_coeff)
         dm0 = mf.make_rdm1(mo_coeff, mo_occ)
         dm0_sorted = opt.sort_orbitals(dm0, axis=[0,1])
         coeff = cupy.asarray(opt.coeff)
-        log = logger.new_logger(mol, mol.verbose)
-        t1 = t0 = log.init_timer()
         vmat_dm = cupy.zeros((_sorted_mol.natm,3,3,nao))
         ipip = cupy.zeros((3,3,nao,nao))
         if xctype == 'LDA':
@@ -389,7 +388,7 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
                     vmat_dm[ia][:,:,mask] += contract('yjg,xjg->xyj', ao_mask[1:4], aow)
                 ao_dm0 = aow = None
                 t1 = log.timer_debug2('integration', *t1)
-            
+
             vmat_dm = opt.unsort_orbitals(vmat_dm, axis=[3])
             for ia in range(_sorted_mol.natm):
                 p0, p1 = aoslices[ia][2:]
@@ -528,7 +527,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
             future = executor.submit(
                 _get_vxc_deriv2_task,
                 hessobj, grids, mo_coeff, mo_occ, max_memory,
-                device_id=device_id)
+                device_id=device_id, verbose=mol.verbose)
             futures.append(future)
     vmat_dm_dist = [future.result() for future in futures]
     vmat_dm = reduce_to_device(vmat_dm_dist, inplace=True)
@@ -540,7 +539,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ni = mf._numint
     nao = mol.nao
     opt = ni.gdftopt
-    
+
     _sorted_mol = opt._sorted_mol
     xctype = ni._xc_type(mf.xc)
     aoslices = mol.aoslice_by_atom()
@@ -551,7 +550,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
     grid_start = device_id * ngrids_per_device
     grid_end = (device_id + 1) * ngrids_per_device
-    
+
     with cupy.cuda.Device(device_id), _streams[device_id]:
         mo_occ = cupy.asarray(mo_occ)
         mo_coeff = cupy.asarray(mo_coeff)
@@ -567,7 +566,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
         t1 = t0 = log.init_timer()
         if xctype == 'LDA':
             ao_deriv = 1
-            for ao, mask, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None, 
+            for ao, mask, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, None,
                                                      grid_range=(grid_start, grid_end)):
                 ao = contract('nip,ij->njp', ao, coeff[mask])
                 rho = numint.eval_rho2(_sorted_mol, ao[0], mo_coeff, mo_occ, mask, xctype)
@@ -645,7 +644,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
                     mow = [numint._scale_ao(mo[:4], wv[i,:4]) for i in range(3)]
                     vmat[ia] += rks_grad._d1_dot_(aow, mo[0].T)
                     vmat[ia] += rks_grad._d1_dot_(mow, ao[0].T).transpose([0,2,1])
-                    
+
                     for j in range(1, 4):
                         aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)]
                         mow = [numint._scale_ao(mo[j], wv[i,4]) for i in range(3)]
diff --git a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
index 1af3e63a..29c58926 100644
--- a/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
+++ b/gpu4pyscf/lib/gvhf-md/unrolled_md_j.cu
@@ -5036,6 +5036,9 @@ int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
         blocks.x = (bounds->npairs_ij + 255) / 256;
         blocks.y = (bounds->npairs_kl + 127) / 128;
         md_j_1_2<<<blocks, threads, 7424*sizeof(double)>>>(*envs, *jk, *bounds); break;
+    // TODO: dynamically select kernels based on max shared memory size
+    // >64KB shared memory is needed for the kernels below
+    /*
     case 18: // lij=2, lkl=0
         blocks.x = (bounds->npairs_ij + 255) / 256;
         blocks.y = (bounds->npairs_kl + 255) / 256;
@@ -5060,6 +5063,7 @@ int md_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
         blocks.x = (bounds->npairs_ij + 63) / 64;
         blocks.y = (bounds->npairs_kl + 255) / 256;
         md_j_4_0<<<blocks, threads, 7808*sizeof(double)>>>(*envs, *jk, *bounds); break;
+    */
     default: return 0;
     }
     return 1;
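The kernels fenced off above request more dynamic shared memory than every supported device provides by default, hence the TODO about choosing kernels from the device limit. On the host side that limit is visible through CuPy's device attributes; a sketch (attribute names follow CUDA's device-attribute enum; launches above 48 KB also require opting the kernel in via cudaFuncSetAttribute):

import cupy

def dyn_smem_fits(requested_bytes):
    attrs = cupy.cuda.Device().attributes
    limit = attrs.get('MaxSharedMemoryPerBlockOptin',
                      attrs['MaxSharedMemoryPerBlock'])
    return requested_bytes <= limit

# md_j_1_2 above launches with 7424 doubles of dynamic shared memory:
print(dyn_smem_fits(7424 * 8))   # 59392 bytes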