diff --git a/README.md b/README.md
index f396f73e..ec2b380a 100644
--- a/README.md
+++ b/README.md
@@ -50,8 +50,7 @@ Features
 - Dispersion corrections via [DFTD3](https://github.com/dftd3/simple-dftd3) and [DFTD4](https://github.com/dftd4/dftd4);
 - Nonlocal functional correction (vv10) for SCF and gradient;
 - ECP is supported and calculated on CPU;
-- PCM solvent models, analytical gradients, and semi-analytical Hessian matrix;
-- SMD solvent models and solvation free energy
+- PCM models, SMD model, their analytical gradients, and semi-analytical Hessian matrix;
 
 Limitations
 --------
diff --git a/benchmarks/smd/benchmark_smd.py b/benchmarks/smd/benchmark_smd.py
new file mode 100644
index 00000000..2b15baaa
--- /dev/null
+++ b/benchmarks/smd/benchmark_smd.py
@@ -0,0 +1,108 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import numpy
+from pyscf import gto
+from gpu4pyscf.solvent import smd
+from gpu4pyscf.solvent.grad import smd as smd_grad
+
+'''
+Benchmark CDS, gradient, hessian in the SMD model
+'''
+
+path = '../molecules/organic/'
+
+# calculated with qchem 6.1, in kcal/mol
+e_cds_qchem = {}
+e_cds_qchem['water'] = {
+    '020_Vitamin_C.xyz': 5.0737,
+    '031_Inosine.xyz': 2.7129,
+    '033_Bisphenol_A.xyz': 6.2620,
+    '037_Mg_Porphin.xyz': 6.0393,
+    '042_Penicillin_V.xyz': 6.4349,
+    '045_Ochratoxin_A.xyz': 8.8526,
+    '052_Cetirizine.xyz': 4.6430,
+    '057_Tamoxifen.xyz': 5.4743,
+    '066_Raffinose.xyz': 10.2543,
+    '084_Sphingomyelin.xyz': 15.0308,
+    '095_Azadirachtin.xyz': 16.9321,
+    '113_Taxol.xyz': 17.2585,
+    '168_Valinomycin.xyz': 27.3499,
+}
+
+e_cds_qchem['ethanol'] = {
+    '020_Vitamin_C.xyz': 4.2119,
+    '031_Inosine.xyz': 1.0175,
+    '033_Bisphenol_A.xyz': -0.2454,
+    '037_Mg_Porphin.xyz': -2.2391,
+    '042_Penicillin_V.xyz': 1.8338,
+    '045_Ochratoxin_A.xyz': 1.0592,
+    '052_Cetirizine.xyz': -2.5099,
+    '057_Tamoxifen.xyz': -3.9320,
+    '066_Raffinose.xyz': 3.1120,
+    '084_Sphingomyelin.xyz': -3.1963,
+    '095_Azadirachtin.xyz': 6.5286,
+    '113_Taxol.xyz': 2.7271,
+    '168_Valinomycin.xyz': 4.0013,
+}
+
+def _check_energy_grad(filename, solvent='water'):
+    xyz = os.path.join(path, filename)
+    mol = gto.Mole(atom=xyz)
+    mol.build()
+    natm = mol.natm
+    fd_cds = numpy.zeros([natm,3])
+    eps = 1e-4
+    for ia in range(mol.natm):
+        for j in range(3):
+            coords = mol.atom_coords(unit='B')
+            coords[ia,j] += eps
+            mol.set_geom_(coords, unit='B')
+            mol.build()
+
+            smdobj = smd.SMD(mol)
+            smdobj.solvent = solvent
+            e0_cds = smdobj.get_cds()
+
+            coords[ia,j] -= 2.0*eps
+            mol.set_geom_(coords, unit='B')
+            mol.build()
+
+            smdobj = smd.SMD(mol)
+            smdobj.solvent = solvent
+            e1_cds = smdobj.get_cds()
+
+            coords[ia,j] += eps
+            mol.set_geom_(coords, unit='B')
+            fd_cds[ia,j] = (e0_cds - e1_cds) / (2.0 * eps)
+
+    smdobj = smd.SMD(mol)
+    smdobj.solvent = solvent
+    e_cds = smd.get_cds(smdobj) * smd.hartree2kcal
+    grad_cds = smd_grad.get_cds(smdobj)
+    print(f'e_cds by GPU4PySCF: {e_cds}')
+    print(f'e_cds by Q-Chem: {e_cds_qchem[solvent][filename]}')
+    print(f'e_cds(Q-Chem) - e_cds(GPU4PySCF): {e_cds - e_cds_qchem[solvent][filename]}')
+    print(f'norm (fd gradient - analy gradient: {numpy.linalg.norm(fd_cds - grad_cds.get())}')
+    assert numpy.linalg.norm(fd_cds - grad_cds.get()) < 1e-8
+
+if __name__ == "__main__":
+    for filename in os.listdir(path):
+        print(f'---- benchmarking {filename} ----------')
+        print('in water')
+        _check_energy_grad(filename, solvent='water')
+        print('in ethanol')
+        _check_energy_grad(filename, solvent='ethanol')
\ No newline at end of file
diff --git a/examples/14-pcm_solvent.py b/examples/14-pcm_solvent.py
index 8eaabbf8..7fc040c7 100644
--- a/examples/14-pcm_solvent.py
+++ b/examples/14-pcm_solvent.py
@@ -33,7 +33,7 @@
 mf.grids.atom_grid = (99,590)
 mf.small_rho_cutoff = 1e-10
 mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids
-mf.with_solvent.method = 'C-PCM'
+mf.with_solvent.method = 'IEF-PCM'
 mf.with_solvent.eps = 78.3553
 mf.kernel()
 
diff --git a/examples/16-smd.py b/examples/16-smd.py
index 32c5802c..10e8ef63 100644
--- a/examples/16-smd.py
+++ b/examples/16-smd.py
@@ -24,11 +24,12 @@
 H      -0.7570000000    -0.0000000000    -0.4696000000
 H       0.7570000000     0.0000000000    -0.4696000000
 '''
-mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=4)
+atom = 'Vitamin_C.xyz'
+mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=6)
 
-mf = dft.rks.RKS(mol, xc='HYB_GGA_XC_B3LYP')#.density_fit()
+mf = dft.rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit()
 mf = mf.SMD()
-mf.verbose = 4
+mf.verbose = 6
 mf.grids.atom_grid = (99,590)
 mf.small_rho_cutoff = 1e-10
 mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids
@@ -36,3 +37,9 @@
 mf.with_solvent.solvent = 'water'
 e_tot = mf.kernel()
 print('total energy with SMD:', e_tot)
+
+gradobj = mf.nuc_grad_method()
+f = gradobj.kernel()
+
+hessobj = mf.Hessian()
+h = hessobj.kernel()
\ No newline at end of file
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index ea5e78a2..98dd2059 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -5,3 +5,8 @@
 # monkey patch libxc reference due to a bug in nvcc
 from pyscf.dft import libxc
 libxc.__reference__ = 'unable to decode the reference due to https://github.com/NVIDIA/cuda-python/issues/29'
+
+from gpu4pyscf.lib.utils import patch_cpu_kernel
+from gpu4pyscf.lib.cupy_helper import tag_array
+from pyscf import lib
+lib.tag_array = tag_array
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
index 42eea65c..332486ff 100644
--- a/gpu4pyscf/lib/cupy_helper.py
+++ b/gpu4pyscf/lib/cupy_helper.py
@@ -17,9 +17,11 @@
 
 import os
 import sys
+import functools
 import numpy as np
 import cupy
 import ctypes
+from pyscf import lib
 from gpu4pyscf.lib import logger
 from gpu4pyscf.gto import mole
 from gpu4pyscf.lib.cutensor import contract
@@ -93,11 +95,22 @@ def device2host_2d(a_cpu, a_gpu, stream=None):
 class CPArrayWithTag(cupy.ndarray):
     pass
 
+@functools.wraps(lib.tag_array)
 def tag_array(a, **kwargs):
-    ''' attach attributes to cupy ndarray'''
-    t = cupy.asarray(a).view(CPArrayWithTag)
-    if isinstance(a, CPArrayWithTag):
-        t.__dict__.update(a.__dict__)
+    '''
+    a should be cupy/numpy array or tuple of cupy/numpy array
+
+    attach attributes to cupy ndarray for cupy array
+    attach attributes to numpy ndarray for numpy array
+    '''
+    if isinstance(a, cupy.ndarray) or isinstance(a[0], cupy.ndarray):
+        t = cupy.asarray(a).view(CPArrayWithTag)
+        if isinstance(a, CPArrayWithTag):
+            t.__dict__.update(a.__dict__)
+    else:
+        t = np.asarray(a).view(lib.NPArrayWithTag)
+        if isinstance(a, lib.NPArrayWithTag):
+            t.__dict__.update(a.__dict__)
     t.__dict__.update(kwargs)
     return t
 
@@ -168,19 +181,23 @@ def add_sparse(a, b, indices):
         raise RuntimeError('failed in sparse_add2d')
     return a
 
-def dist_matrix(coords, out=None):
-    assert coords.flags.c_contiguous
-    n = coords.shape[0]
+def dist_matrix(x, y, out=None):
+    assert x.flags.c_contiguous
+    assert y.flags.c_contiguous
+
+    m = x.shape[0]
+    n = y.shape[0]
     if out is None:
-        out = cupy.empty([n,n])
+        out = cupy.empty([m,n])
 
     stream = cupy.cuda.get_current_stream()
     err = libcupy_helper.dist_matrix(
         ctypes.cast(stream.ptr, ctypes.c_void_p),
         ctypes.cast(out.data.ptr, ctypes.c_void_p),
-        ctypes.cast(coords.data.ptr, ctypes.c_void_p),
-        ctypes.cast(coords.data.ptr, ctypes.c_void_p),
-        ctypes.c_int(n),
+        ctypes.cast(x.data.ptr, ctypes.c_void_p),
+        ctypes.cast(y.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(m),
+        ctypes.c_int(n)
     )
     if err != 0:
         raise RuntimeError('failed in calculating distance matrix')
@@ -376,10 +393,8 @@ def cart2sph(t, axis=0, ang=1, out=None):
     t_cart = t.reshape([i0*nli, li_size[0], i3])
     if(out is not None):
         out = out.reshape([i0*nli, li_size[1], i3])
-        out[:] = cupy.einsum('min,ip->mpn', t_cart, c2s)
-    else:
-        out = cupy.einsum('min,ip->mpn', t_cart, c2s)
-    return out.reshape(out_shape)
+    t_sph = contract('min,ip->mpn', t_cart, c2s, out=out)
+    return t_sph.reshape(out_shape)
 
 # a copy with modification from
 # https://github.com/pyscf/pyscf/blob/9219058ac0a1bcdd8058166cad0fb9127b82e9bf/pyscf/lib/linalg_helper.py#L1536
diff --git a/gpu4pyscf/lib/cupy_helper/dist_matrix.cu b/gpu4pyscf/lib/cupy_helper/dist_matrix.cu
index 77f78de7..2b88db4f 100644
--- a/gpu4pyscf/lib/cupy_helper/dist_matrix.cu
+++ b/gpu4pyscf/lib/cupy_helper/dist_matrix.cu
@@ -19,11 +19,11 @@
 #define THREADS        32
 
 __global__
-static void _calc_distances(double *dist, const double *x, const double *y, int n)
+static void _calc_distances(double *dist, const double *x, const double *y, int m, int n)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int j = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= n || j >= n){
+    if (i >= m || j >= n){
         return;
     }
 
@@ -34,12 +34,13 @@ static void _calc_distances(double *dist, const double *x, const double *y, int
 }
 
 extern "C" {
-int dist_matrix(cudaStream_t stream, double *dist, const double *x, const double *y, int n)
+int dist_matrix(cudaStream_t stream, double *dist, const double *x, const double *y, int m, int n)
 {
-    int ntile = (n + THREADS - 1) / THREADS;
+    int ntilex = (m + THREADS - 1) / THREADS;
+    int ntiley = (n + THREADS - 1) / THREADS;
     dim3 threads(THREADS, THREADS);
-    dim3 blocks(ntile, ntile);
-    _calc_distances<<<blocks, threads, 0, stream>>>(dist, x, y, n);
+    dim3 blocks(ntilex, ntiley);
+    _calc_distances<<<blocks, threads, 0, stream>>>(dist, x, y, m, n);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
         return 1;
diff --git a/gpu4pyscf/lib/gvhf/g3c2e.cuh b/gpu4pyscf/lib/gvhf/g3c2e.cuh
index 9195cddd..4a06f1f7 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e.cuh
+++ b/gpu4pyscf/lib/gvhf/g3c2e.cuh
@@ -673,331 +673,4 @@ static void write_int3c2e_ip2_jk(JKMatrix jk, double *j3, double* k3, int ksh){
     }
 }
 
-__global__
-static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
-{
-    int ntasks_ij = offsets.ntasks_ij;
-    int ntasks_kl = offsets.ntasks_kl;
-    int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
-    int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
-    bool active = true;
-    if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
-        active = false;
-        task_ij = 0;
-        task_kl = 0;
-    }
-    int bas_ij = offsets.bas_ij + task_ij;
-    int bas_kl = offsets.bas_kl + task_kl;
-    double norm = envs.fac;
-    double omega = envs.omega;
-    int *bas_pair2bra = c_bpcache.bas_pair2bra;
-    int *bas_pair2ket = c_bpcache.bas_pair2ket;
-    int ish = bas_pair2bra[bas_ij];
-    int jsh = bas_pair2ket[bas_ij];
-    int ksh = bas_pair2bra[bas_kl];
-    int nprim_ij = envs.nprim_ij;
-    int nprim_kl = envs.nprim_kl;
-    int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
-    int prim_kl = offsets.primitive_kl + task_kl * nprim_kl;
-    double* __restrict__ a12 = c_bpcache.a12;
-    double* __restrict__ e12 = c_bpcache.e12;
-    double* __restrict__ x12 = c_bpcache.x12;
-    double* __restrict__ y12 = c_bpcache.y12;
-    double* __restrict__ z12 = c_bpcache.z12;
-    double* __restrict__ a1 = c_bpcache.a1;
-    int ij, kl;
-    int prim_ij0, prim_ij1, prim_kl0, prim_kl1;
-    int nbas = c_bpcache.nbas;
-    double* __restrict__ bas_x = c_bpcache.bas_coords;
-    double* __restrict__ bas_y = bas_x + nbas;
-    double* __restrict__ bas_z = bas_y + nbas;
-
-    double gout0 = 0;
-    double gout1 = 0;
-    double gout2 = 0;
-    double xi = bas_x[ish];
-    double yi = bas_y[ish];
-    double zi = bas_z[ish];
-    prim_ij0 = prim_ij;
-    prim_ij1 = prim_ij + nprim_ij;
-    prim_kl0 = prim_kl;
-    prim_kl1 = prim_kl + nprim_kl;
-    for (ij = prim_ij0; ij < prim_ij1; ++ij) {
-    for (kl = prim_kl0; kl < prim_kl1; ++kl) {
-        double ai2 = -2.0*a1[ij];
-        double aij = a12[ij];
-        double eij = e12[ij];
-        double xij = x12[ij];
-        double yij = y12[ij];
-        double zij = z12[ij];
-        double akl = a12[kl];
-        double ekl = e12[kl];
-        double xkl = x12[kl];
-        double ykl = y12[kl];
-        double zkl = z12[kl];
-        double xijxkl = xij - xkl;
-        double yijykl = yij - ykl;
-        double zijzkl = zij - zkl;
-        double aijkl = aij + akl;
-        double a1 = aij * akl;
-        double a0 = a1 / aijkl;
-        double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
-        a0 *= theta;
-        double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
-        double fac = eij * ekl * sqrt(a0 / (a1 * a1 * a1));
-        double root0, weight0;
-        if (x < 3.e-7) {
-            root0 = 0.5;
-            weight0 = 1.;
-        } else {
-            double tt = sqrt(x);
-            double fmt0 = SQRTPIE4 / tt * erf(tt);
-            weight0 = fmt0;
-            double e = exp(-x);
-            double b = .5 / x;
-            double fmt1 = b * (fmt0 - e);
-            root0 = fmt1 / (fmt0 - fmt1);
-        }
-        root0 /= root0 + 1 - root0 * theta;
-        double u2 = a0 * root0;
-        double tmp2 = akl * u2 / (u2 * aijkl + a1);;
-        double c00x = xij - xi - tmp2 * xijxkl;
-        double c00y = yij - yi - tmp2 * yijykl;
-        double c00z = zij - zi - tmp2 * zijzkl;
-        double g_0 = 1;
-        double g_1 = c00x;
-        double g_2 = 1;
-        double g_3 = c00y;
-        double g_4 = norm * fac * weight0;
-        double g_5 = g_4 * c00z;
-
-        double f_1 = ai2 * g_1;
-        double f_3 = ai2 * g_3;
-        double f_5 = ai2 * g_5;
-
-        gout0 += f_1 * g_2 * g_4;
-        gout1 += g_0 * f_3 * g_4;
-        gout2 += g_0 * g_2 * f_5;
-    } }
 
-    int *ao_loc = c_bpcache.ao_loc;
-    int i0 = ao_loc[ish] - jk.ao_offsets_i;
-    int j0 = ao_loc[jsh] - jk.ao_offsets_j;
-    int k0 = ao_loc[ksh] - jk.ao_offsets_k;
-
-    int nao = jk.nao;
-    double* __restrict__ dm = jk.dm;
-    double* __restrict__ rhok = jk.rhok;
-    double* __restrict__ rhoj = jk.rhoj;
-    double* __restrict__ vj = jk.vj;
-    double* __restrict__ vk = jk.vk;
-
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-    __shared__ double sdata[THREADSX][THREADSY];
-    if (!active){
-        gout0 = 0.0; gout1 = 0.0; gout2 = 0.0;
-    }
-    if (vj != NULL){
-        double rhoj_tmp;
-        int off_dm = i0 + nao*j0;
-        rhoj_tmp = dm[off_dm] * rhoj[k0];
-        double vj_tmp[3];
-        vj_tmp[0] = gout0*rhoj_tmp;
-        vj_tmp[1] = gout1*rhoj_tmp;
-        vj_tmp[2] = gout2*rhoj_tmp;
-        for (int j = 0; j < 3; j++){
-            sdata[tx][ty] = vj_tmp[j]; __syncthreads();
-            if(ty<8) sdata[tx][ty] += sdata[tx][ty+8]; __syncthreads();
-            if(ty<4) sdata[tx][ty] += sdata[tx][ty+4]; __syncthreads();
-            if(ty<2) sdata[tx][ty] += sdata[tx][ty+2]; __syncthreads();
-            if(ty<1) sdata[tx][ty] += sdata[tx][ty+1]; __syncthreads();
-            if (ty == 0) atomicAdd(vj+i0+j*nao, sdata[tx][0]);
-        }
-    }
-    if (vk != NULL){
-        double rhok_tmp;
-        int off_rhok = i0 + nao*j0 + k0*nao*nao;
-        rhok_tmp = rhok[off_rhok];
-        double vk_tmp[3];
-        vk_tmp[0] = gout0 * rhok_tmp;
-        vk_tmp[1] = gout1 * rhok_tmp;
-        vk_tmp[2] = gout2 * rhok_tmp;
-        for (int j = 0; j < 3; j++){
-            sdata[tx][ty] = vk_tmp[j]; __syncthreads();
-            if(ty<8) sdata[tx][ty] += sdata[tx][ty+8]; __syncthreads();
-            if(ty<4) sdata[tx][ty] += sdata[tx][ty+4]; __syncthreads();
-            if(ty<2) sdata[tx][ty] += sdata[tx][ty+2]; __syncthreads();
-            if(ty<1) sdata[tx][ty] += sdata[tx][ty+1]; __syncthreads();
-            if (ty == 0) atomicAdd(vk+i0+j*nao, sdata[tx][0]);
-        }
-    }
-}
-
-
-__global__
-static void GINTrun_int3c2e_ip2_jk_kernel0010(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
-{
-    int ntasks_ij = offsets.ntasks_ij;
-    int ntasks_kl = offsets.ntasks_kl;
-    int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
-    int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
-    bool active = true;
-    if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
-        active = false;
-        task_ij = 0;
-        task_kl = 0;
-    }
-    int bas_ij = offsets.bas_ij + task_ij;
-    int bas_kl = offsets.bas_kl + task_kl;
-    double norm = envs.fac;
-    double omega = envs.omega;
-    int *bas_pair2bra = c_bpcache.bas_pair2bra;
-    int *bas_pair2ket = c_bpcache.bas_pair2ket;
-    int ish = bas_pair2bra[bas_ij];
-    int jsh = bas_pair2ket[bas_ij];
-    int ksh = bas_pair2bra[bas_kl];
-    int nprim_ij = envs.nprim_ij;
-    int nprim_kl = envs.nprim_kl;
-    int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
-    int prim_kl = offsets.primitive_kl + task_kl * nprim_kl;
-    double* __restrict__ a12 = c_bpcache.a12;
-    double* __restrict__ e12 = c_bpcache.e12;
-    double* __restrict__ x12 = c_bpcache.x12;
-    double* __restrict__ y12 = c_bpcache.y12;
-    double* __restrict__ z12 = c_bpcache.z12;
-    double* __restrict__ a1 = c_bpcache.a1;
-    int ij, kl;
-    int prim_ij0, prim_ij1, prim_kl0, prim_kl1;
-    int nbas = c_bpcache.nbas;
-    double* __restrict__ bas_x = c_bpcache.bas_coords;
-    double* __restrict__ bas_y = bas_x + nbas;
-    double* __restrict__ bas_z = bas_y + nbas;
-
-    double gout0 = 0;
-    double gout1 = 0;
-    double gout2 = 0;
-    double xk = bas_x[ksh];
-    double yk = bas_y[ksh];
-    double zk = bas_z[ksh];
-    prim_ij0 = prim_ij;
-    prim_ij1 = prim_ij + nprim_ij;
-    prim_kl0 = prim_kl;
-    prim_kl1 = prim_kl + nprim_kl;
-    for (ij = prim_ij0; ij < prim_ij1; ++ij) {
-    for (kl = prim_kl0; kl < prim_kl1; ++kl) {
-        double ak2 = -2.0*a1[kl];
-        double aij = a12[ij];
-        double eij = e12[ij];
-        double xij = x12[ij];
-        double yij = y12[ij];
-        double zij = z12[ij];
-        double akl = a12[kl];
-        double ekl = e12[kl];
-        double xkl = x12[kl];
-        double ykl = y12[kl];
-        double zkl = z12[kl];
-        double xijxkl = xij - xkl;
-        double yijykl = yij - ykl;
-        double zijzkl = zij - zkl;
-        double aijkl = aij + akl;
-        double a1 = aij * akl;
-        double a0 = a1 / aijkl;
-        double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
-        a0 *= theta;
-        double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
-        double fac = norm * eij * ekl * sqrt(a0 / (a1 * a1 * a1));
-        double root0, weight0;
-        if (x < 3.e-7) {
-            root0 = 0.5;
-            weight0 = 1.;
-        } else {
-            double tt = sqrt(x);
-            double fmt0 = SQRTPIE4 / tt * erf(tt);
-            weight0 = fmt0;
-            double e = exp(-x);
-            double b = .5 / x;
-            double fmt1 = b * (fmt0 - e);
-            root0 = fmt1 / (fmt0 - fmt1);
-        }
-        root0 /= root0 + 1 - root0 * theta;
-        double u2 = a0 * root0;
-        double tmp4 = .5 / (u2 * aijkl + a1);
-        double b00 = u2 * tmp4;
-        double tmp1 = 2 * b00;
-        double tmp3 = tmp1 * aij;
-        double c0px = xkl - xk + tmp3 * xijxkl;
-        double c0py = ykl - yk + tmp3 * yijykl;
-        double c0pz = zkl - zk + tmp3 * zijzkl;
-        double g_0 = 1;
-        double g_1 = c0px;
-        double g_2 = 1;
-        double g_3 = c0py;
-        double g_4 = weight0 * fac;
-        double g_5 = c0pz * g_4;
-
-        double f_1 = ak2 * g_1;
-        double f_3 = ak2 * g_3;
-        double f_5 = ak2 * g_5;
-
-        gout0 += f_1 * g_2 * g_4;
-        gout1 += g_0 * f_3 * g_4;
-        gout2 += g_0 * g_2 * f_5;
-
-    } }
-
-    int *ao_loc = c_bpcache.ao_loc;
-    int i0 = ao_loc[ish] - jk.ao_offsets_i;
-    int j0 = ao_loc[jsh] - jk.ao_offsets_j;
-    int k0 = ao_loc[ksh] - jk.ao_offsets_k;
-
-    int nao = jk.nao;
-    int naux = jk.naux;
-    double* __restrict__ dm = jk.dm;
-    double* __restrict__ rhok = jk.rhok;
-    double* __restrict__ rhoj = jk.rhoj;
-    double* __restrict__ vj = jk.vj;
-    double* __restrict__ vk = jk.vk;
-
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-    __shared__ double sdata[THREADSX][THREADSY];
-    if (!active){
-        gout0 = 0.0; gout1 = 0.0; gout2 = 0.0;
-    }
-    if (vj != NULL){
-        double rhoj_tmp;
-        int off_dm = i0 + nao*j0;
-        rhoj_tmp = dm[off_dm] * rhoj[k0];
-        double vj_tmp[3];
-        vj_tmp[0] = gout0 * rhoj_tmp;
-        vj_tmp[1] = gout1 * rhoj_tmp;
-        vj_tmp[2] = gout2 * rhoj_tmp;
-        for (int j = 0; j < 3; j++){
-            sdata[tx][ty] = vj_tmp[j]; __syncthreads();
-            if(tx<8) sdata[tx][ty] += sdata[tx+8][ty]; __syncthreads();
-            if(tx<4) sdata[tx][ty] += sdata[tx+4][ty]; __syncthreads();
-            if(tx<2) sdata[tx][ty] += sdata[tx+2][ty]; __syncthreads();
-            if(tx<1) sdata[tx][ty] += sdata[tx+1][ty]; __syncthreads();
-            if (tx == 0) atomicAdd(vj+k0+j*naux, sdata[0][ty]);
-        }
-    }
-
-    if (vk != NULL){
-        double rhok_tmp;
-        int off_rhok = i0 + nao*j0 + k0*nao*nao;
-        rhok_tmp = rhok[off_rhok];
-        double vk_tmp[3];
-        vk_tmp[0] = gout0 * rhok_tmp;
-        vk_tmp[1] = gout1 * rhok_tmp;
-        vk_tmp[2] = gout2 * rhok_tmp;
-        for (int j = 0; j < 3; j++){
-            sdata[tx][ty] = vk_tmp[j]; __syncthreads();
-            if(tx<8) sdata[tx][ty] += sdata[tx+8][ty]; __syncthreads();
-            if(tx<4) sdata[tx][ty] += sdata[tx+4][ty]; __syncthreads();
-            if(tx<2) sdata[tx][ty] += sdata[tx+2][ty]; __syncthreads();
-            if(tx<1) sdata[tx][ty] += sdata[tx+1][ty]; __syncthreads();
-            if (tx == 0) atomicAdd(vk+k0+j*naux, sdata[0][ty]);
-        }
-    }
-}
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_ip1.cu b/gpu4pyscf/lib/gvhf/g3c2e_ip1.cu
index cd20ac66..3c0c88fc 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_ip1.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_ip1.cu
@@ -13,7 +13,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
- 
+
 
 template <int NROOTS, int GSIZE> __global__
 void GINTint3c2e_ip1_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
@@ -47,7 +47,7 @@ void GINTint3c2e_ip1_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets o
     double uw[NROOTS*2];
     double g[2*GSIZE];
     double *f = g + GSIZE;
-    
+
     double* __restrict__ a12 = c_bpcache.a12;
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
@@ -69,7 +69,7 @@ void GINTint3c2e_ip1_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets o
         as_ksh = lsh;
         as_lsh = ksh;
     }
-    
+
     double j3[GPU_CART_MAX * 3];
     double k3[GPU_CART_MAX * 3];
     for (int k = 0; k < GPU_CART_MAX * 3; k++){
@@ -79,7 +79,7 @@ void GINTint3c2e_ip1_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets o
     if (active) {
         for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
             for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
-                            
+
             double aij = a12[ij];
             double xij = x12[ij];
             double yij = y12[ij];
@@ -94,19 +94,180 @@ void GINTint3c2e_ip1_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets o
             double aijkl = aij + akl;
             double a1 = aij * akl;
             double a0 = a1 / aijkl;
-            double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; 
+            double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
             a0 *= theta;
             double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
             GINTrys_root<NROOTS>(x, uw);
             GINTscale_u<NROOTS>(uw, theta);
             GINTg0_2e_2d4d<NROOTS>(envs, g, uw, norm, as_ish, as_jsh, as_ksh, as_lsh, ij, kl);
-                
+
             double ai2 = -2.0*exp[ij];
             GINTnabla1i_2e<NROOTS>(envs, f, g, ai2, envs.i_l, envs.j_l, envs.k_l);
             GINTkernel_int3c2e_ip1_getjk_direct<NROOTS>(envs, jk, j3, k3, f, g, ish, jsh, ksh);
-            } 
+            }
         }
     }
-    
+
     write_int3c2e_ip1_jk(jk, j3, k3, ish);
 }
+
+__global__
+static void GINTrun_int3c2e_ip1_jk_kernel1000(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
+{
+    int ntasks_ij = offsets.ntasks_ij;
+    int ntasks_kl = offsets.ntasks_kl;
+    int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
+    bool active = true;
+    if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
+        active = false;
+        task_ij = 0;
+        task_kl = 0;
+    }
+    int bas_ij = offsets.bas_ij + task_ij;
+    int bas_kl = offsets.bas_kl + task_kl;
+    double norm = envs.fac;
+    double omega = envs.omega;
+    int *bas_pair2bra = c_bpcache.bas_pair2bra;
+    int *bas_pair2ket = c_bpcache.bas_pair2ket;
+    int ish = bas_pair2bra[bas_ij];
+    int jsh = bas_pair2ket[bas_ij];
+    int ksh = bas_pair2bra[bas_kl];
+    int nprim_ij = envs.nprim_ij;
+    int nprim_kl = envs.nprim_kl;
+    int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+    int prim_kl = offsets.primitive_kl + task_kl * nprim_kl;
+    double* __restrict__ a12 = c_bpcache.a12;
+    double* __restrict__ e12 = c_bpcache.e12;
+    double* __restrict__ x12 = c_bpcache.x12;
+    double* __restrict__ y12 = c_bpcache.y12;
+    double* __restrict__ z12 = c_bpcache.z12;
+    double* __restrict__ a1 = c_bpcache.a1;
+    int ij, kl;
+    int prim_ij0, prim_ij1, prim_kl0, prim_kl1;
+    int nbas = c_bpcache.nbas;
+    double* __restrict__ bas_x = c_bpcache.bas_coords;
+    double* __restrict__ bas_y = bas_x + nbas;
+    double* __restrict__ bas_z = bas_y + nbas;
+
+    double gout0 = 0;
+    double gout1 = 0;
+    double gout2 = 0;
+    double xi = bas_x[ish];
+    double yi = bas_y[ish];
+    double zi = bas_z[ish];
+    prim_ij0 = prim_ij;
+    prim_ij1 = prim_ij + nprim_ij;
+    prim_kl0 = prim_kl;
+    prim_kl1 = prim_kl + nprim_kl;
+    for (ij = prim_ij0; ij < prim_ij1; ++ij) {
+    for (kl = prim_kl0; kl < prim_kl1; ++kl) {
+        double ai2 = -2.0*a1[ij];
+        double aij = a12[ij];
+        double eij = e12[ij];
+        double xij = x12[ij];
+        double yij = y12[ij];
+        double zij = z12[ij];
+        double akl = a12[kl];
+        double ekl = e12[kl];
+        double xkl = x12[kl];
+        double ykl = y12[kl];
+        double zkl = z12[kl];
+        double xijxkl = xij - xkl;
+        double yijykl = yij - ykl;
+        double zijzkl = zij - zkl;
+        double aijkl = aij + akl;
+        double a1 = aij * akl;
+        double a0 = a1 / aijkl;
+        double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+        a0 *= theta;
+        double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
+        double fac = eij * ekl * sqrt(a0 / (a1 * a1 * a1));
+        double root0, weight0;
+        if (x < 3.e-7) {
+            root0 = 0.5;
+            weight0 = 1.;
+        } else {
+            double tt = sqrt(x);
+            double fmt0 = SQRTPIE4 / tt * erf(tt);
+            weight0 = fmt0;
+            double e = exp(-x);
+            double b = .5 / x;
+            double fmt1 = b * (fmt0 - e);
+            root0 = fmt1 / (fmt0 - fmt1);
+        }
+        root0 /= root0 + 1 - root0 * theta;
+        double u2 = a0 * root0;
+        double tmp2 = akl * u2 / (u2 * aijkl + a1);;
+        double c00x = xij - xi - tmp2 * xijxkl;
+        double c00y = yij - yi - tmp2 * yijykl;
+        double c00z = zij - zi - tmp2 * zijzkl;
+        double g_0 = 1;
+        double g_1 = c00x;
+        double g_2 = 1;
+        double g_3 = c00y;
+        double g_4 = norm * fac * weight0;
+        double g_5 = g_4 * c00z;
+
+        double f_1 = ai2 * g_1;
+        double f_3 = ai2 * g_3;
+        double f_5 = ai2 * g_5;
+
+        gout0 += f_1 * g_2 * g_4;
+        gout1 += g_0 * f_3 * g_4;
+        gout2 += g_0 * g_2 * f_5;
+    } }
+
+    int *ao_loc = c_bpcache.ao_loc;
+    int i0 = ao_loc[ish] - jk.ao_offsets_i;
+    int j0 = ao_loc[jsh] - jk.ao_offsets_j;
+    int k0 = ao_loc[ksh] - jk.ao_offsets_k;
+
+    int nao = jk.nao;
+    double* __restrict__ dm = jk.dm;
+    double* __restrict__ rhok = jk.rhok;
+    double* __restrict__ rhoj = jk.rhoj;
+    double* __restrict__ vj = jk.vj;
+    double* __restrict__ vk = jk.vk;
+
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;
+    __shared__ double sdata[THREADSX][THREADSY];
+    if (!active){
+        gout0 = 0.0; gout1 = 0.0; gout2 = 0.0;
+    }
+    if (vj != NULL){
+        double rhoj_tmp;
+        int off_dm = i0 + nao*j0;
+        rhoj_tmp = dm[off_dm] * rhoj[k0];
+        double vj_tmp[3];
+        vj_tmp[0] = gout0*rhoj_tmp;
+        vj_tmp[1] = gout1*rhoj_tmp;
+        vj_tmp[2] = gout2*rhoj_tmp;
+        for (int j = 0; j < 3; j++){
+            sdata[tx][ty] = vj_tmp[j]; __syncthreads();
+            if(ty<8) sdata[tx][ty] += sdata[tx][ty+8]; __syncthreads();
+            if(ty<4) sdata[tx][ty] += sdata[tx][ty+4]; __syncthreads();
+            if(ty<2) sdata[tx][ty] += sdata[tx][ty+2]; __syncthreads();
+            if(ty<1) sdata[tx][ty] += sdata[tx][ty+1]; __syncthreads();
+            if (ty == 0) atomicAdd(vj+i0+j*nao, sdata[tx][0]);
+        }
+    }
+    if (vk != NULL){
+        double rhok_tmp;
+        int off_rhok = i0 + nao*j0 + k0*nao*nao;
+        rhok_tmp = rhok[off_rhok];
+        double vk_tmp[3];
+        vk_tmp[0] = gout0 * rhok_tmp;
+        vk_tmp[1] = gout1 * rhok_tmp;
+        vk_tmp[2] = gout2 * rhok_tmp;
+        for (int j = 0; j < 3; j++){
+            sdata[tx][ty] = vk_tmp[j]; __syncthreads();
+            if(ty<8) sdata[tx][ty] += sdata[tx][ty+8]; __syncthreads();
+            if(ty<4) sdata[tx][ty] += sdata[tx][ty+4]; __syncthreads();
+            if(ty<2) sdata[tx][ty] += sdata[tx][ty+2]; __syncthreads();
+            if(ty<1) sdata[tx][ty] += sdata[tx][ty+1]; __syncthreads();
+            if (ty == 0) atomicAdd(vk+i0+j*nao, sdata[tx][0]);
+        }
+    }
+}
diff --git a/gpu4pyscf/lib/gvhf/g3c2e_ip2.cu b/gpu4pyscf/lib/gvhf/g3c2e_ip2.cu
index e2a4cdbc..d83f3608 100644
--- a/gpu4pyscf/lib/gvhf/g3c2e_ip2.cu
+++ b/gpu4pyscf/lib/gvhf/g3c2e_ip2.cu
@@ -13,7 +13,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
- 
+
 
 template <int NROOTS, int GSIZE> __global__
 void GINTint3c2e_ip2_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
@@ -47,7 +47,7 @@ void GINTint3c2e_ip2_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets o
     double uw[NROOTS*2];
     double g[2*GSIZE];
     double *f = g + GSIZE;
-    
+
     double* __restrict__ a12 = c_bpcache.a12;
     double* __restrict__ x12 = c_bpcache.x12;
     double* __restrict__ y12 = c_bpcache.y12;
@@ -69,7 +69,7 @@ void GINTint3c2e_ip2_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets o
         as_ksh = lsh;
         as_lsh = ksh;
     }
-    
+
     double j3[GPU_CART_MAX * 3];
     double k3[GPU_CART_MAX * 3];
     for (int k = 0; k < GPU_CART_MAX * 3; k++){
@@ -79,7 +79,7 @@ void GINTint3c2e_ip2_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets o
     if (active) {
         for (ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) {
             for (kl = prim_kl; kl < prim_kl+nprim_kl; ++kl) {
-                            
+
             double aij = a12[ij];
             double xij = x12[ij];
             double yij = y12[ij];
@@ -94,19 +94,187 @@ void GINTint3c2e_ip2_jk_kernel(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets o
             double aijkl = aij + akl;
             double a1 = aij * akl;
             double a0 = a1 / aijkl;
-            double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; 
+            double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
             a0 *= theta;
             double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
             GINTrys_root<NROOTS>(x, uw);
             GINTscale_u<NROOTS>(uw, theta);
             GINTg0_2e_2d4d<NROOTS>(envs, g, uw, norm, as_ish, as_jsh, as_ksh, as_lsh, ij, kl);
-                
+
             double ak2 = -2.0*exp[kl];
             GINTnabla1k_2e<NROOTS>(envs, f, g, ak2, envs.i_l, envs.j_l, envs.k_l);
             GINTkernel_int3c2e_ip2_getjk_direct<NROOTS>(envs, jk, j3, k3, f, g, ish, jsh, ksh);
-            } 
+            }
         }
     }
-    
+
     write_int3c2e_ip2_jk(jk, j3, k3, ksh);
 }
+
+
+__global__
+static void GINTrun_int3c2e_ip2_jk_kernel0010(GINTEnvVars envs, JKMatrix jk, BasisProdOffsets offsets)
+{
+    int ntasks_ij = offsets.ntasks_ij;
+    int ntasks_kl = offsets.ntasks_kl;
+    int task_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    int task_kl = blockIdx.y * blockDim.y + threadIdx.y;
+    bool active = true;
+    if (task_ij >= ntasks_ij || task_kl >= ntasks_kl) {
+        active = false;
+        task_ij = 0;
+        task_kl = 0;
+    }
+    int bas_ij = offsets.bas_ij + task_ij;
+    int bas_kl = offsets.bas_kl + task_kl;
+    double norm = envs.fac;
+    double omega = envs.omega;
+    int *bas_pair2bra = c_bpcache.bas_pair2bra;
+    int *bas_pair2ket = c_bpcache.bas_pair2ket;
+    int ish = bas_pair2bra[bas_ij];
+    int jsh = bas_pair2ket[bas_ij];
+    int ksh = bas_pair2bra[bas_kl];
+    int nprim_ij = envs.nprim_ij;
+    int nprim_kl = envs.nprim_kl;
+    int prim_ij = offsets.primitive_ij + task_ij * nprim_ij;
+    int prim_kl = offsets.primitive_kl + task_kl * nprim_kl;
+    double* __restrict__ a12 = c_bpcache.a12;
+    double* __restrict__ e12 = c_bpcache.e12;
+    double* __restrict__ x12 = c_bpcache.x12;
+    double* __restrict__ y12 = c_bpcache.y12;
+    double* __restrict__ z12 = c_bpcache.z12;
+    double* __restrict__ a1 = c_bpcache.a1;
+    int ij, kl;
+    int prim_ij0, prim_ij1, prim_kl0, prim_kl1;
+    int nbas = c_bpcache.nbas;
+    double* __restrict__ bas_x = c_bpcache.bas_coords;
+    double* __restrict__ bas_y = bas_x + nbas;
+    double* __restrict__ bas_z = bas_y + nbas;
+
+    double gout0 = 0;
+    double gout1 = 0;
+    double gout2 = 0;
+    double xk = bas_x[ksh];
+    double yk = bas_y[ksh];
+    double zk = bas_z[ksh];
+    prim_ij0 = prim_ij;
+    prim_ij1 = prim_ij + nprim_ij;
+    prim_kl0 = prim_kl;
+    prim_kl1 = prim_kl + nprim_kl;
+    for (ij = prim_ij0; ij < prim_ij1; ++ij) {
+    for (kl = prim_kl0; kl < prim_kl1; ++kl) {
+        double ak2 = -2.0*a1[kl];
+        double aij = a12[ij];
+        double eij = e12[ij];
+        double xij = x12[ij];
+        double yij = y12[ij];
+        double zij = z12[ij];
+        double akl = a12[kl];
+        double ekl = e12[kl];
+        double xkl = x12[kl];
+        double ykl = y12[kl];
+        double zkl = z12[kl];
+        double xijxkl = xij - xkl;
+        double yijykl = yij - ykl;
+        double zijzkl = zij - zkl;
+        double aijkl = aij + akl;
+        double a1 = aij * akl;
+        double a0 = a1 / aijkl;
+        double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0;
+        a0 *= theta;
+        double x = a0 * (xijxkl * xijxkl + yijykl * yijykl + zijzkl * zijzkl);
+        double fac = norm * eij * ekl * sqrt(a0 / (a1 * a1 * a1));
+        double root0, weight0;
+        if (x < 3.e-7) {
+            root0 = 0.5;
+            weight0 = 1.;
+        } else {
+            double tt = sqrt(x);
+            double fmt0 = SQRTPIE4 / tt * erf(tt);
+            weight0 = fmt0;
+            double e = exp(-x);
+            double b = .5 / x;
+            double fmt1 = b * (fmt0 - e);
+            root0 = fmt1 / (fmt0 - fmt1);
+        }
+        root0 /= root0 + 1 - root0 * theta;
+        double u2 = a0 * root0;
+        double tmp4 = .5 / (u2 * aijkl + a1);
+        double b00 = u2 * tmp4;
+        double tmp1 = 2 * b00;
+        double tmp3 = tmp1 * aij;
+        double c0px = xkl - xk + tmp3 * xijxkl;
+        double c0py = ykl - yk + tmp3 * yijykl;
+        double c0pz = zkl - zk + tmp3 * zijzkl;
+        double g_0 = 1;
+        double g_1 = c0px;
+        double g_2 = 1;
+        double g_3 = c0py;
+        double g_4 = weight0 * fac;
+        double g_5 = c0pz * g_4;
+
+        double f_1 = ak2 * g_1;
+        double f_3 = ak2 * g_3;
+        double f_5 = ak2 * g_5;
+
+        gout0 += f_1 * g_2 * g_4;
+        gout1 += g_0 * f_3 * g_4;
+        gout2 += g_0 * g_2 * f_5;
+
+    } }
+
+    int *ao_loc = c_bpcache.ao_loc;
+    int i0 = ao_loc[ish] - jk.ao_offsets_i;
+    int j0 = ao_loc[jsh] - jk.ao_offsets_j;
+    int k0 = ao_loc[ksh] - jk.ao_offsets_k;
+
+    int nao = jk.nao;
+    int naux = jk.naux;
+    double* __restrict__ dm = jk.dm;
+    double* __restrict__ rhok = jk.rhok;
+    double* __restrict__ rhoj = jk.rhoj;
+    double* __restrict__ vj = jk.vj;
+    double* __restrict__ vk = jk.vk;
+
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;
+    __shared__ double sdata[THREADSX][THREADSY];
+    if (!active){
+        gout0 = 0.0; gout1 = 0.0; gout2 = 0.0;
+    }
+    if (vj != NULL){
+        double rhoj_tmp;
+        int off_dm = i0 + nao*j0;
+        rhoj_tmp = dm[off_dm] * rhoj[k0];
+        double vj_tmp[3];
+        vj_tmp[0] = gout0 * rhoj_tmp;
+        vj_tmp[1] = gout1 * rhoj_tmp;
+        vj_tmp[2] = gout2 * rhoj_tmp;
+        for (int j = 0; j < 3; j++){
+            sdata[tx][ty] = vj_tmp[j]; __syncthreads();
+            if(tx<8) sdata[tx][ty] += sdata[tx+8][ty]; __syncthreads();
+            if(tx<4) sdata[tx][ty] += sdata[tx+4][ty]; __syncthreads();
+            if(tx<2) sdata[tx][ty] += sdata[tx+2][ty]; __syncthreads();
+            if(tx<1) sdata[tx][ty] += sdata[tx+1][ty]; __syncthreads();
+            if (tx == 0) atomicAdd(vj+k0+j*naux, sdata[0][ty]);
+        }
+    }
+
+    if (vk != NULL){
+        double rhok_tmp;
+        int off_rhok = i0 + nao*j0 + k0*nao*nao;
+        rhok_tmp = rhok[off_rhok];
+        double vk_tmp[3];
+        vk_tmp[0] = gout0 * rhok_tmp;
+        vk_tmp[1] = gout1 * rhok_tmp;
+        vk_tmp[2] = gout2 * rhok_tmp;
+        for (int j = 0; j < 3; j++){
+            sdata[tx][ty] = vk_tmp[j]; __syncthreads();
+            if(tx<8) sdata[tx][ty] += sdata[tx+8][ty]; __syncthreads();
+            if(tx<4) sdata[tx][ty] += sdata[tx+4][ty]; __syncthreads();
+            if(tx<2) sdata[tx][ty] += sdata[tx+2][ty]; __syncthreads();
+            if(tx<1) sdata[tx][ty] += sdata[tx+1][ty]; __syncthreads();
+            if (tx == 0) atomicAdd(vk+k0+j*naux, sdata[0][ty]);
+        }
+    }
+}
\ No newline at end of file
diff --git a/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_ip1.cu b/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_ip1.cu
index 9174520d..5b7faf14 100644
--- a/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_ip1.cu
+++ b/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_ip1.cu
@@ -68,17 +68,17 @@ static int GINTrun_tasks_int3c2e_ip1_jk(JKMatrix *jk, BasisProdOffsets *offsets,
 
 extern "C" { __host__
 int GINTbuild_int3c2e_ip1_jk(BasisProdCache *bpcache,
-                 double *vj, double *vk, double *dm, double *rhoj, double *rhok, 
+                 double *vj, double *vk, double *dm, double *rhoj, double *rhok,
                  int *ao_offsets, int nao, int naux, int n_dm,
                  int *bins_locs_ij, int ntasks_kl, int ncp_ij, int cp_kl_id, double omega)
 {
     ContractionProdType *cp_kl = bpcache->cptype + cp_kl_id;
-    
+
     int ng[4] = {1,0,0,0};
-    
+
     // move bpcache to constant memory
     checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
-    
+
     JKMatrix jk;
     jk.n_dm = n_dm;
     jk.nao = nao;
@@ -92,8 +92,7 @@ int GINTbuild_int3c2e_ip1_jk(BasisProdCache *bpcache,
     jk.ao_offsets_j = ao_offsets[1];
     jk.ao_offsets_k = ao_offsets[2];
     jk.ao_offsets_l = ao_offsets[3];
-    BasisProdOffsets offsets;
-    
+
     int *bas_pairs_locs = bpcache->bas_pairs_locs;
     int *primitive_pairs_locs = bpcache->primitive_pairs_locs;
 
@@ -101,9 +100,9 @@ int GINTbuild_int3c2e_ip1_jk(BasisProdCache *bpcache,
     for (int n = 0; n < MAX_STREAMS; n++){
         checkCudaErrors(cudaStreamCreate(&streams[n]));
     }
-    
+
     int *idx = (int *)malloc(sizeof(int) * TOT_NF * 3);
-    int *l_locs = (int *)malloc(sizeof(int) * (GPU_LMAX + 2)); 
+    int *l_locs = (int *)malloc(sizeof(int) * (GPU_LMAX + 2));
     GINTinit_index1d_xyz(idx, l_locs);
     checkCudaErrors(cudaMemcpyToSymbol(c_idx, idx, sizeof(int) * TOT_NF*3));
     checkCudaErrors(cudaMemcpyToSymbol(c_l_locs, l_locs, sizeof(int) * (GPU_LMAX + 2)));
@@ -122,7 +121,8 @@ int GINTbuild_int3c2e_ip1_jk(BasisProdCache *bpcache,
 
         int ntasks_ij = bins_locs_ij[cp_ij_id+1] - bins_locs_ij[cp_ij_id];
         if (ntasks_ij <= 0) continue;
-        
+
+        BasisProdOffsets offsets;
         offsets.ntasks_ij = ntasks_ij;
         offsets.ntasks_kl = ntasks_kl;
         offsets.bas_ij = bas_pairs_locs[cp_ij_id];
@@ -140,7 +140,7 @@ int GINTbuild_int3c2e_ip1_jk(BasisProdCache *bpcache,
         checkCudaErrors(cudaStreamSynchronize(streams[n]));
         checkCudaErrors(cudaStreamDestroy(streams[n]));
     }
-    
+
     return 0;
 }
 
diff --git a/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_ip2.cu b/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_ip2.cu
index 7adefe75..cfc45bfa 100644
--- a/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_ip2.cu
+++ b/gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_ip2.cu
@@ -67,17 +67,17 @@ static int GINTrun_tasks_int3c2e_ip2_jk(JKMatrix *jk, BasisProdOffsets *offsets,
 
 extern "C" { __host__
 int GINTbuild_int3c2e_ip2_jk(BasisProdCache *bpcache,
-                 double *vj, double *vk, double *dm, double *rhoj, double *rhok, 
+                 double *vj, double *vk, double *dm, double *rhoj, double *rhok,
                  int *ao_offsets, int nao, int naux, int n_dm,
                  int *bins_locs_ij, int ntasks_kl, int ncp_ij, int cp_kl_id, double omega)
 {
     ContractionProdType *cp_kl = bpcache->cptype + cp_kl_id;
-    
+
     int ng[4] = {0,0,1,0};
-    
+
     // move bpcache to constant memory
     checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache)));
-    
+
     JKMatrix jk;
     jk.n_dm = n_dm;
     jk.nao = nao;
@@ -91,18 +91,17 @@ int GINTbuild_int3c2e_ip2_jk(BasisProdCache *bpcache,
     jk.ao_offsets_j = ao_offsets[1];
     jk.ao_offsets_k = ao_offsets[2];
     jk.ao_offsets_l = ao_offsets[3];
-    BasisProdOffsets offsets;
-    
+
     int *bas_pairs_locs = bpcache->bas_pairs_locs;
     int *primitive_pairs_locs = bpcache->primitive_pairs_locs;
-    
+
     cudaStream_t streams[MAX_STREAMS];
     for (int n = 0; n < MAX_STREAMS; n++){
         checkCudaErrors(cudaStreamCreate(&streams[n]));
     }
-    
+
     int *idx = (int *)malloc(sizeof(int) * TOT_NF * 3);
-    int *l_locs = (int *)malloc(sizeof(int) * (GPU_LMAX + 2)); 
+    int *l_locs = (int *)malloc(sizeof(int) * (GPU_LMAX + 2));
     GINTinit_index1d_xyz(idx, l_locs);
     checkCudaErrors(cudaMemcpyToSymbol(c_idx, idx, sizeof(int) * TOT_NF*3));
     checkCudaErrors(cudaMemcpyToSymbol(c_l_locs, l_locs, sizeof(int) * (GPU_LMAX + 2)));
@@ -111,7 +110,7 @@ int GINTbuild_int3c2e_ip2_jk(BasisProdCache *bpcache,
 
     for (int cp_ij_id = 0; cp_ij_id < ncp_ij; cp_ij_id++){
         int n_stream = cp_ij_id % MAX_STREAMS;
-        
+
         GINTEnvVars envs;
         ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id;
         GINTinit_EnvVars(&envs, cp_ij, cp_kl, ng);
@@ -123,6 +122,7 @@ int GINTbuild_int3c2e_ip2_jk(BasisProdCache *bpcache,
         int ntasks_ij = bins_locs_ij[cp_ij_id+1] - bins_locs_ij[cp_ij_id];
         if (ntasks_ij <= 0) continue;
 
+        BasisProdOffsets offsets;
         offsets.ntasks_ij = ntasks_ij;
         offsets.ntasks_kl = ntasks_kl;
         offsets.bas_ij = bas_pairs_locs[cp_ij_id];
@@ -131,7 +131,7 @@ int GINTbuild_int3c2e_ip2_jk(BasisProdCache *bpcache,
         offsets.primitive_kl = primitive_pairs_locs[cp_kl_id];
 
         int err = GINTrun_tasks_int3c2e_ip2_jk(&jk, &offsets, &envs, streams[n_stream]);
-        
+
         if (err != 0) {
             return err;
         }
diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py
index 2b085674..987c7165 100644
--- a/gpu4pyscf/lib/tests/test_cupy_helper.py
+++ b/gpu4pyscf/lib/tests/test_cupy_helper.py
@@ -72,8 +72,7 @@ def test_sparse(self):
     def test_dist_matrix(self):
         a = cupy.random.rand(4, 3)
         rij = cupy.sum((a[:,None,:] - a[None,:,:])**2, axis=2)**0.5
-
-        rij0 = dist_matrix(a)
+        rij0 = dist_matrix(a, a)
         assert cupy.linalg.norm(rij - rij0) < 1e-10
 
     def test_takebak(self):
diff --git a/gpu4pyscf/lib/tests/test_cutensor.py b/gpu4pyscf/lib/tests/test_cutensor.py
index 7cbd08d5..ca338331 100644
--- a/gpu4pyscf/lib/tests/test_cutensor.py
+++ b/gpu4pyscf/lib/tests/test_cutensor.py
@@ -16,7 +16,7 @@
 import unittest
 import numpy
 import cupy
-from gpu4pyscf.lib.cupy_helper import *
+from gpu4pyscf.lib.cupy_helper import contract
 
 class KnownValues(unittest.TestCase):
     def test_contract(self):
@@ -49,7 +49,7 @@ def test_cache(self):
         c = contract('ijkl,jl->ik', a, b)
         c_einsum = cupy.einsum('ijkl,jl->ik', a, b)
         assert cupy.linalg.norm(c - c_einsum) < 1e-10
-    
+
 if __name__ == "__main__":
     print("Full tests for cutensor module")
     unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
index cf6cda8f..a2149c4d 100644
--- a/gpu4pyscf/solvent/grad/pcm.py
+++ b/gpu4pyscf/solvent/grad/pcm.py
@@ -24,9 +24,10 @@
 from pyscf import lib
 from pyscf import gto, df
 from pyscf.grad import rhf as rhf_grad
+from pyscf.solvent import ddcosmo_grad
+
 from gpu4pyscf.solvent.pcm import PI, switch_h
 from gpu4pyscf.df import int3c2e
-
 from gpu4pyscf.lib.cupy_helper import contract
 from gpu4pyscf.lib import logger
 
@@ -193,7 +194,6 @@ def grad_qv(pcmobj, dm):
     intopt.build(1e-14, diag_block_with_triu=True, aosym=False)
     coeff = intopt.coeff
     dm_cart = coeff @ dm @ coeff.T
-    #dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
 
     dvj, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip1', q_sym, None, dm_cart)
     dq, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip2', q_sym, None, dm_cart)
@@ -232,10 +232,10 @@ def grad_solver(pcmobj, dm):
 
     dF, dA = get_dF_dA(pcmobj.surface)
 
-    with_D = pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE']
+    with_D = pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']
     dD, dS, dSii = get_dD_dS(pcmobj.surface, dF, with_D=with_D, with_S=True)
 
-    if pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE']:
+    if pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']:
         DA = D*A
 
     epsilon = pcmobj.eps
@@ -253,7 +253,7 @@ def grad_solver(pcmobj, dm):
         de -= cupy.asarray([cupy.sum(de_dS[p0:p1], axis=0) for p0,p1, in gridslice])
         de -= 0.5*contract('i,xij->jx', vK_1*q, dSii) # 0.5*cupy.einsum('i,xij,i->jx', vK_1, dSii, q)
 
-    elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE']:
+    elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']:
         dD = dD.transpose([2,0,1])
         dS = dS.transpose([2,0,1])
         dSii = dSii.transpose([2,0,1])
@@ -315,41 +315,51 @@ def contract_ket(a, B, c):
     return de.get()
 
 def make_grad_object(grad_method):
-    '''
-    return solvent gradient object
-    '''
-    grad_method_class = grad_method.__class__
-    class WithSolventGrad(grad_method_class):
-        def __init__(self, grad_method):
-            self.__dict__.update(grad_method.__dict__)
-            self.de_solvent = None
-            self.de_solute = None
-            self._keys = self._keys.union(['de_solvent', 'de_solute'])
-
-        def kernel(self, *args, dm=None, atmlst=None, **kwargs):
-            dm = kwargs.pop('dm', None)
-            if dm is None:
-                dm = self.base.make_rdm1(ao_repr=True)
-
-            self.de_solvent = grad_qv(self.base.with_solvent, dm)
-            self.de_solvent+= grad_solver(self.base.with_solvent, dm)
-            self.de_solvent+= grad_nuc(self.base.with_solvent, dm)
-
-            self.de_solute = grad_method_class.kernel(self, *args, **kwargs)
-            self.de = self.de_solute + self.de_solvent
-
-            if self.verbose >= logger.NOTE:
-                logger.note(self, '--------------- %s (+%s) gradients ---------------',
-                            self.base.__class__.__name__,
-                            self.base.with_solvent.__class__.__name__)
-                rhf_grad._write(self, self.mol, self.de, self.atmlst)
-                logger.note(self, '----------------------------------------------')
-            return self.de
-
-        def _finalize(self):
-            # disable _finalize. It is called in grad_method.kernel method
-            # where self.de was not yet initialized.
-            pass
-
-    return WithSolventGrad(grad_method)
-
+    '''For grad_method in vacuum, add nuclear gradients of solvent pcmobj'''
+    if grad_method.base.with_solvent.frozen:
+        raise RuntimeError('Frozen solvent model is not avialbe for energy gradients')
+
+    name = (grad_method.base.with_solvent.__class__.__name__
+            + grad_method.__class__.__name__)
+    return lib.set_class(WithSolventGrad(grad_method),
+                         (WithSolventGrad, grad_method.__class__), name)
+
+class WithSolventGrad:
+    _keys = {'de_solvent', 'de_solute'}
+
+    def __init__(self, grad_method):
+        self.__dict__.update(grad_method.__dict__)
+        self.de_solvent = None
+        self.de_solute = None
+
+    def undo_solvent(self):
+        cls = self.__class__
+        name_mixin = self.base.with_solvent.__class__.__name__
+        obj = lib.view(self, lib.drop_class(cls, WithSolventGrad, name_mixin))
+        del obj.de_solvent
+        del obj.de_solute
+        return obj
+
+    def kernel(self, *args, dm=None, atmlst=None, **kwargs):
+        dm = kwargs.pop('dm', None)
+        if dm is None:
+            dm = self.base.make_rdm1(ao_repr=True)
+
+        self.de_solute = super().kernel(*args, **kwargs)
+        self.de_solvent = grad_qv(self.base.with_solvent, dm)
+        self.de_solvent+= grad_solver(self.base.with_solvent, dm)
+        self.de_solvent+= grad_nuc(self.base.with_solvent, dm)
+        self.de = self.de_solute + self.de_solvent
+
+        if self.verbose >= logger.NOTE:
+            logger.note(self, '--------------- %s (+%s) gradients ---------------',
+                        self.base.__class__.__name__,
+                        self.base.with_solvent.__class__.__name__)
+            rhf_grad._write(self, self.mol, self.de, self.atmlst)
+            logger.note(self, '----------------------------------------------')
+        return self.de
+
+    def _finalize(self):
+        # disable _finalize. It is called in grad_method.kernel method
+        # where self.de was not yet initialized.
+        pass
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/grad/smd.py b/gpu4pyscf/solvent/grad/smd.py
new file mode 100644
index 00000000..60e25a22
--- /dev/null
+++ b/gpu4pyscf/solvent/grad/smd.py
@@ -0,0 +1,279 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+'''
+Gradient of PCM family solvent model
+'''
+# pylint: disable=C0103
+
+import numpy as np
+import cupy
+#from cupyx import scipy, jit
+from pyscf import lib
+from pyscf import gto, df
+from pyscf.grad import rhf as rhf_grad
+from pyscf.data import radii
+from pyscf.solvent import ddcosmo_grad
+
+from gpu4pyscf.solvent import pcm, smd
+from gpu4pyscf.solvent.grad import pcm as pcm_grad
+from gpu4pyscf.solvent.smd import (
+    sigma_water, sigma_n, sigma_alpha, sigma_beta, r_zz, swtich_function,
+    hartree2kcal)
+from gpu4pyscf.df import int3c2e
+from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib import logger
+
+def grad_swtich_function(R, r, dr):
+    if R < r + dr:
+        return -np.exp(dr/(R-dr-r)) * dr / (R-dr-r)**2
+    else:
+        return 0.0
+
+def atomic_surface_tension(symbols, coords, n, alpha, beta, water=True):
+    '''
+    - list of atomic symbols
+    - atomic coordinates in Anstrong
+    - solvent descriptors: n, alpha, beta
+    '''
+
+    def get_bond_tension(bond):
+        if water:
+            return sigma_water.get(bond, 0.0)
+        t = 0.0
+        t += sigma_n.get(bond, 0.0) * n
+        t += sigma_alpha.get(bond, 0.0) * alpha
+        t += sigma_beta.get(bond, 0.0) * beta
+        return t
+
+    def get_atom_tension(sym_i):
+        if water:
+            return sigma_water.get(sym_i, 0.0)
+        t = 0.0
+        t += sigma_n.get(sym_i, 0.0) * n
+        t += sigma_alpha.get(sym_i, 0.0) * alpha
+        t += sigma_beta.get(sym_i, 0.0) * beta
+        return t
+    natm = coords.shape[0]
+    ri_rj = coords[:,None,:] - coords[None,:,:]
+    rij = np.sum(ri_rj**2, axis=2)**0.5
+    np.fill_diagonal(rij, 1)
+    drij = ri_rj / np.expand_dims(rij, axis=-1)
+    tensions = []
+    for i, sym_i in enumerate(symbols):
+        if sym_i not in ['H', 'C', 'N', 'O', 'F', 'Si', 'S', 'Cl', 'Br']:
+            tensions.append(np.zeros([natm,3]))
+            continue
+
+        tension = np.zeros([natm,3])
+        if sym_i in ['F', 'Si', 'S', 'Cl', 'Br']:
+            tensions.append(tension)
+            continue
+
+        if sym_i == 'H':
+            dt_HC = np.zeros([natm,3])
+            dt_HO = np.zeros([natm,3])
+            for j, sym_j in enumerate(symbols):
+                if sym_j == 'C':
+                    r, dr = r_zz.get(('H','C'), (0.0, 0.0))
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_HC[i] += dt_drij
+                    dt_HC[j] -= dt_drij
+                if sym_j == 'O':
+                    r, dr = r_zz.get(('H','O'), (0.0, 0.0))
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_HO[i] += dt_drij
+                    dt_HO[j] -= dt_drij
+            sig_HC = get_bond_tension(('H','C'))
+            sig_HO = get_bond_tension(('H','O'))
+            tension += sig_HC * dt_HC + sig_HO * dt_HO
+            tensions.append(tension)
+            continue
+
+        if sym_i == 'C':
+            dt_CC = np.zeros([natm,3])
+            dt_CN = np.zeros([natm,3])
+            t_CN = 0.0
+            for j, sym_j in enumerate(symbols):
+                if sym_j == 'C' and i != j:
+                    r, dr = r_zz.get(('C', 'C'), (0.0, 0.0))
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_CC[i] += dt_drij
+                    dt_CC[j] -= dt_drij
+                if sym_j == 'N':
+                    r, dr = r_zz.get(('C', 'N'), (0.0, 0.0))
+                    t_CN += swtich_function(rij[i,j], r, dr)
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_CN[i] += dt_drij
+                    dt_CN[j] -= dt_drij
+            sig_CC = get_bond_tension(('C','C'))
+            sig_CN = get_bond_tension(('C','N'))
+            tension += sig_CC * dt_CC + sig_CN * (2 * t_CN * dt_CN)
+            tensions.append(tension)
+            continue
+
+        if sym_i == 'N':
+            t_NC = 0.0
+            dt_NC = np.zeros([natm,3])
+            dt_NC3 = np.zeros([natm,3])
+            for j, sym_j in enumerate(symbols):
+                if sym_j == 'C':
+                    r, dr = r_zz.get(('N','C'), (0.0, 0.0))
+                    tk = 0.0
+                    dtk = np.zeros([natm,3])
+                    for k, sym_k in enumerate(symbols):
+                        if k != i and k != j:
+                            rjk, drjk = r_zz.get(('C', sym_k), (0.0, 0.0))
+                            tk += swtich_function(rij[j,k], rjk, drjk)
+                            dtk_rjk = grad_swtich_function(rij[j,k], rjk, drjk) * drij[j,k]
+                            dtk[j] += dtk_rjk
+                            dtk[k] -= dtk_rjk
+
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j] * tk**2
+                    dt_NC[i] += dt_drij
+                    dt_NC[j] -= dt_drij
+
+                    t = swtich_function(rij[i,j], r, dr)
+                    dt_NC += t * (2 * tk * dtk)
+                    t_NC += t * tk**2
+
+                    r, dr = r_zz.get(('N','C3'), (0.0, 0.0))
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_NC3[i] += dt_drij
+                    dt_NC3[j] -= dt_drij
+            sig_NC = get_bond_tension(('N','C'))
+            sig_NC3= get_bond_tension(('N','C3'))
+            tension += sig_NC * (1.3 * t_NC**0.3 * dt_NC) + sig_NC3 * dt_NC3
+            tensions.append(tension)
+            continue
+
+        if sym_i == 'O':
+            dt_OC = np.zeros([natm,3])
+            dt_ON = np.zeros([natm,3])
+            dt_OO = np.zeros([natm,3])
+            dt_OP = np.zeros([natm,3])
+            for j, sym_j in enumerate(symbols):
+                if sym_j == 'C':
+                    r, dr = r_zz.get(('O','C'), (0.0, 0.0))
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_OC[i] += dt_drij
+                    dt_OC[j] -= dt_drij
+                if sym_j == 'N':
+                    r, dr = r_zz.get(('O','N'), (0.0, 0.0))
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_ON[i] += dt_drij
+                    dt_ON[j] -= dt_drij
+                if sym_j == 'O' and j != i:
+                    r, dr = r_zz.get(('O','O'), (0.0, 0.0))
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_OO[i] += dt_drij
+                    dt_OO[j] -= dt_drij
+                if sym_j == 'P':
+                    r, dr = r_zz.get(('O','P'), (0.0, 0.0))
+                    dt_drij = grad_swtich_function(rij[i,j], r, dr) * drij[i,j]
+                    dt_OP[i] += dt_drij
+                    dt_OP[j] -= dt_drij
+            sig_OC = get_bond_tension(('O','C'))
+            sig_ON = get_bond_tension(('O','N'))
+            sig_OO = get_bond_tension(('O','O'))
+            sig_OP = get_bond_tension(('O','P'))
+            tension += sig_OC * dt_OC + sig_ON * dt_ON + sig_OO * dt_OO + sig_OP * dt_OP
+            tensions.append(tension)
+            continue
+
+    return cupy.asarray(tensions)
+
+def get_cds(smdobj):
+    mol = smdobj.mol
+    n, _, alpha, beta, gamma, _, phi, psi = smdobj.solvent_descriptors
+    symbols = [mol.atom_symbol(ia) for ia in range(mol.natm)]
+    coords = mol.atom_coords(unit='A')
+    if smdobj._solvent.lower() != 'water':
+        grad_tension = atomic_surface_tension(symbols, coords, n, alpha, beta, water=False)
+        atm_tension = smd.atomic_surface_tension(symbols, coords, n, alpha, beta, water=False)
+        mol_tension = smd.molecular_surface_tension(beta, gamma, phi, psi)
+    else:
+        grad_tension = atomic_surface_tension(symbols, coords, n, alpha, beta, water=True)
+        atm_tension = smd.atomic_surface_tension(symbols, coords, n, alpha, beta, water=True)
+        mol_tension = 0.0
+
+    # generate surface for calculating SASA
+    rad = radii.VDW + 0.4/radii.BOHR
+    surface = pcm.gen_surface(mol, ng=smdobj.sasa_ng, rad=rad)
+    _, grad_area = pcm_grad.get_dF_dA(surface)
+    area = surface['area']
+    gridslice = surface['gslice_by_atom']
+    SASA = cupy.asarray([cupy.sum(area[p0:p1], axis=0) for p0,p1, in gridslice]).get()
+    grad_SASA = cupy.asarray([cupy.sum(grad_area[p0:p1], axis=0) for p0,p1, in gridslice]).get()
+    SASA *= radii.BOHR**2
+    grad_SASA *= radii.BOHR**2
+    mol_cds = mol_tension * np.sum(grad_SASA, axis=0) / 1000
+    grad_tension *= radii.BOHR
+    atm_cds = np.einsum('i,ijx->jx', SASA, grad_tension.get()) / 1000
+    atm_cds+= np.einsum('i,ijx->jx', atm_tension.get(), grad_SASA) / 1000
+    return (mol_cds + atm_cds)/hartree2kcal # hartree
+
+def make_grad_object(grad_method):
+    '''For grad_method in vacuum, add nuclear gradients of solvent pcmobj'''
+    if grad_method.base.with_solvent.frozen:
+        raise RuntimeError('Frozen solvent model is not avialbe for energy gradients')
+
+    name = (grad_method.base.with_solvent.__class__.__name__
+            + grad_method.__class__.__name__)
+    return lib.set_class(WithSolventGrad(grad_method),
+                         (WithSolventGrad, grad_method.__class__), name)
+
+class WithSolventGrad:
+    _keys = {'de_solvent', 'de_solute'}
+
+    def __init__(self, grad_method):
+        self.__dict__.update(grad_method.__dict__)
+        self.de_solvent = None
+        self.de_solute = None
+
+    def undo_solvent(self):
+        cls = self.__class__
+        name_mixin = self.base.with_solvent.__class__.__name__
+        obj = lib.view(self, lib.drop_class(cls, WithSolventGrad, name_mixin))
+        del obj.de_solvent
+        del obj.de_solute
+        return obj
+
+    def kernel(self, *args, dm=None, atmlst=None, **kwargs):
+        dm = kwargs.pop('dm', None)
+        if dm is None:
+            dm = self.base.make_rdm1(ao_repr=True)
+
+        self.de_solute = super().kernel(*args, **kwargs)
+        self.de_solvent = pcm_grad.grad_qv(self.base.with_solvent, dm)
+        self.de_solvent+= pcm_grad.grad_solver(self.base.with_solvent, dm)
+        self.de_solvent+= pcm_grad.grad_nuc(self.base.with_solvent, dm)
+
+        self.de = self.de_solute + self.de_solvent
+        self.de += get_cds(self.base.with_solvent)
+        if self.verbose >= logger.NOTE:
+            logger.note(self, '--------------- %s (+%s) gradients ---------------',
+                        self.base.__class__.__name__,
+                        self.base.with_solvent.__class__.__name__)
+            rhf_grad._write(self, self.mol, self.de, self.atmlst)
+            logger.note(self, '----------------------------------------------')
+        return self.de
+
+    def _finalize(self):
+        # disable _finalize. It is called in grad_method.kernel method
+        # where self.de was not yet initialized.
+        pass
+
+
diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py
index ae28942c..4c5c4ba7 100644
--- a/gpu4pyscf/solvent/hessian/pcm.py
+++ b/gpu4pyscf/solvent/hessian/pcm.py
@@ -131,7 +131,11 @@ def pcm_grad_scanner(mol):
         pcmobj.reset(mol)
         e, v = pcmobj._get_vind(dm)
         #return grad_elec(pcmobj, dm)
-        return grad_nuc(pcmobj, dm) + grad_solver(pcmobj, dm) + grad_qv(pcmobj, dm)
+        pcm_grad = grad_nuc(pcmobj, dm)
+        pcm_grad+= grad_solver(pcmobj, dm)
+        pcm_grad+= grad_qv(pcmobj, dm)
+        return pcm_grad
+
     mol.verbose = 0
     de = numpy.zeros([mol.natm, mol.natm, 3, 3])
     eps = 1e-3
@@ -224,44 +228,49 @@ def analytic_grad_vmat(pcmobj, mo_coeff, mo_occ, atmlst=None, verbose=None):
     pcmobj.reset(pmol)
     return vmat
 """
+
 def make_hess_object(hess_method):
-    '''
-    return solvent hessian object
-    '''
-    hess_method_class = hess_method.__class__
-    class WithSolventHess(hess_method_class):
-        def __init__(self, hess_method):
-            self.__dict__.update(hess_method.__dict__)
-            self.de_solvent = None
-            self.de_solute = None
-            self._keys = self._keys.union(['de_solvent', 'de_solute'])
-
-        def kernel(self, *args, dm=None, atmlst=None, **kwargs):
-            dm = kwargs.pop('dm', None)
-            if dm is None:
-                dm = self.base.make_rdm1(ao_repr=True)
-            is_equilibrium = self.base.with_solvent.equilibrium_solvation
-            self.base.with_solvent.equilibrium_solvation = True
-            self.de_solvent = hess_elec(self.base.with_solvent, dm, verbose=self.verbose)
-            #self.de_solvent+= hess_nuc(self.base.with_solvent)
-            self.de_solute = hess_method_class.kernel(self, *args, **kwargs)
-            self.de = self.de_solute + self.de_solvent
-            self.base.with_solvent.equilibrium_solvation = is_equilibrium
-            return self.de
-
-        def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
-            if atmlst is None:
-                atmlst = range(self.mol.natm)
-            h1ao = hess_method_class.make_h1(self, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
-            dv = fd_grad_vmat(self.base.with_solvent, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
-            for i0, ia in enumerate(atmlst):
-                h1ao[i0] += dv[i0]
-            return h1ao
-
-        def _finalize(self):
-            # disable _finalize. It is called in grad_method.kernel method
-            # where self.de was not yet initialized.
-            pass
-
-    return WithSolventHess(hess_method)
+    if hess_method.base.with_solvent.frozen:
+        raise RuntimeError('Frozen solvent model is not avialbe for energy hessian')
+
+    name = (hess_method.base.with_solvent.__class__.__name__
+            + hess_method.__class__.__name__)
+    return lib.set_class(WithSolventHess(hess_method),
+                         (WithSolventHess, hess_method.__class__), name)
+
+class WithSolventHess:
+    _keys = {'de_solvent', 'de_solute'}
+
+    def __init__(self, hess_method):
+        self.__dict__.update(hess_method.__dict__)
+        self.de_solvent = None
+        self.de_solute = None
+
+    def kernel(self, *args, dm=None, atmlst=None, **kwargs):
+        dm = kwargs.pop('dm', None)
+        if dm is None:
+            dm = self.base.make_rdm1(ao_repr=True)
+        is_equilibrium = self.base.with_solvent.equilibrium_solvation
+        self.base.with_solvent.equilibrium_solvation = True
+        self.de_solvent = hess_elec(self.base.with_solvent, dm, verbose=self.verbose)
+        #self.de_solvent+= hess_nuc(self.base.with_solvent)
+        self.de_solute = super().kernel(*args, **kwargs)
+        self.de = self.de_solute + self.de_solvent
+        self.base.with_solvent.equilibrium_solvation = is_equilibrium
+        return self.de
+
+    def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
+        if atmlst is None:
+            atmlst = range(self.mol.natm)
+        h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+        dv = fd_grad_vmat(self.base.with_solvent, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+        for i0, ia in enumerate(atmlst):
+            h1ao[i0] += dv[i0]
+        return h1ao
+
+    def _finalize(self):
+        # disable _finalize. It is called in grad_method.kernel method
+        # where self.de was not yet initialized.
+        pass
+
 
diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py
new file mode 100644
index 00000000..af41595b
--- /dev/null
+++ b/gpu4pyscf/solvent/hessian/smd.py
@@ -0,0 +1,303 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+'''
+Gradient of PCM family solvent model
+'''
+# pylint: disable=C0103
+
+import numpy as np
+import cupy
+#from cupyx import scipy, jit
+from pyscf import lib
+from pyscf import gto, df
+from pyscf.grad import rhf as rhf_grad
+from pyscf.data import radii
+from pyscf.solvent import ddcosmo_grad
+
+from gpu4pyscf.solvent import pcm, smd
+from gpu4pyscf.solvent.grad import pcm as pcm_grad
+from gpu4pyscf.solvent.grad import smd as smd_grad
+from gpu4pyscf.solvent.hessian import pcm as pcm_hess
+
+from gpu4pyscf.solvent.smd import (
+    sigma_water, sigma_n, sigma_alpha, sigma_beta, r_zz, swtich_function,
+    hartree2kcal)
+from gpu4pyscf.df import int3c2e
+from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib import logger
+
+def hess_swtich_function(R, r, dr):
+    if R < r + dr:
+        dist = (R[0]**2 + R[1]**2 + R[2]**2)**.5
+        hess = [
+            [R[1]**2+R[2]**2, -R[0]*R[1], -R[0]*R[2]],
+            [-R[0]*R[1], R[0]**2+R[2]**2, -R[1]*R[2]],
+            [-R[0]*R[2], -R[1]*R[2], R[0]**2+R[1]**2]]
+        return np.asarray(hess)/dist
+    else:
+        return np.zeros([3,3])
+
+def atomic_surface_tension(symbols, coords, n, alpha, beta, water=True):
+    '''
+    TODO: debug later
+    - list of atomic symbols
+    - atomic coordinates in Anstrong
+    - solvent descriptors: n, alpha, beta
+    '''
+
+    def get_bond_tension(bond):
+        if water:
+            return sigma_water.get(bond, 0.0)
+        t = 0.0
+        t += sigma_n.get(bond, 0.0) * n
+        t += sigma_alpha.get(bond, 0.0) * alpha
+        t += sigma_beta.get(bond, 0.0) * beta
+        return t
+
+    def get_atom_tension(sym_i):
+        if water:
+            return sigma_water.get(sym_i, 0.0)
+        t = 0.0
+        t += sigma_n.get(sym_i, 0.0) * n
+        t += sigma_alpha.get(sym_i, 0.0) * alpha
+        t += sigma_beta.get(sym_i, 0.0) * beta
+        return t
+    natm = coords.shape[0]
+    ri_rj = coords[:,None,:] - coords[None,:,:]
+    rij = np.sum(ri_rj**2, axis=2)**0.5
+    np.fill_diagonal(rij, 1)
+    #drij = ri_rj / np.expand_dims(rij, axis=-1)
+    tensions = []
+    for i, sym_i in enumerate(symbols):
+        if sym_i not in ['H', 'C', 'N', 'O', 'F', 'Si', 'S', 'Cl', 'Br']:
+            tensions.append(np.zeros([natm,3]))
+            continue
+
+        tension = np.zeros([natm,3])
+        if sym_i in ['F', 'Si', 'S', 'Cl', 'Br']:
+            tensions.append(tension)
+            continue
+
+        if sym_i == 'H':
+            dt_HC = np.zeros([natm,natm,3,3])
+            dt_HO = np.zeros([natm,natm,3,3])
+            for j, sym_j in enumerate(symbols):
+                if sym_j == 'C':
+                    r, dr = r_zz.get(('H','C'), (0.0, 0.0))
+                    dt_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    dt_HC[i,i] += dt_drij
+                    dt_HC[i,j] -= dt_drij
+                    dt_HC[j,i] -= dt_drij
+                    dt_HC[j,j] += dt_drij
+                if sym_j == 'O':
+                    r, dr = r_zz.get(('H','O'), (0.0, 0.0))
+                    dt_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    dt_HO[i,i] += dt_drij
+                    dt_HO[i,j] -= dt_drij
+                    dt_HO[j,i] -= dt_drij
+                    dt_HO[j,j] += dt_drij
+            sig_HC = get_bond_tension(('H','C'))
+            sig_HO = get_bond_tension(('H','O'))
+            tension += sig_HC * dt_HC + sig_HO * dt_HO
+            tensions.append(tension)
+
+        if sym_i == 'C':
+            dt_CN = np.zeros([natm,3])
+            d2t_CC = np.zeros([natm,natm,3,3])
+            d2t_CN = np.zeros([natm,natm,3,3])
+            t_CN = 0.0
+            for j, sym_j in enumerate(symbols):
+                if sym_j == 'C' and i != j:
+                    r, dr = r_zz.get(('C', 'C'), (0.0, 0.0))
+                    d2t_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    d2t_CC[i,i] += d2t_drij
+                    d2t_CC[i,j] -= d2t_drij
+                    d2t_CC[j,i] -= d2t_drij
+                    d2t_CC[j,j] += d2t_drij
+                if sym_j == 'N':
+                    r, dr = r_zz.get(('C', 'N'), (0.0, 0.0))
+                    t_CN += swtich_function(rij[i,j], r, dr)
+                    dt_drij = smd_grad.grad_switch_function(coords[i]-coords[j], r, dr)
+                    dt_CN[i] += dt_drij
+                    dt_CN[j] -= dt_drij
+                    d2t_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    d2t_CN[i,i] += d2t_drij
+                    d2t_CN[i,j] -= d2t_drij
+                    d2t_CN[j,i] -= d2t_drij
+                    d2t_CN[j,j] += d2t_drij
+            sig_CC = get_bond_tension(('C','C'))
+            sig_CN = get_bond_tension(('C','N'))
+            tension += sig_CC * d2t_CC + sig_CN * (2 * t_CN * d2t_CN + 2 * np.einsum('i,j->ij', dt_drij[i], dt_drij[j]))
+            tensions.append(tension)
+
+        if sym_i == 'N':
+            t_NC = 0.0
+            dt_NC = np.zeros([natm,natm,3,3])
+            dt_NC3 = np.zeros([natm,natm,3,3])
+            for j, sym_j in enumerate(symbols):
+                if sym_j == 'C':
+                    r, dr = r_zz.get(('N','C'), (0.0, 0.0))
+                    tk = 0.0
+                    dtk = np.zeros([natm,natm,3,3])
+                    for k, sym_k in enumerate(symbols):
+                        if k != i and k != j:
+                            rjk, drjk = r_zz.get(('C', sym_k), (0.0, 0.0))
+                            tk += swtich_function(rij[j,k], rjk, drjk)
+                            dtk_rjk = hess_swtich_function(coords[j]-coords[k], rjk, drjk)
+                            dtk[j,j] += dtk_rjk
+                            dtk[j,k] -= dtk_rjk
+                            dtk[k,j] -= dtk_rjk
+                            dtk[k,k] += dtk_rjk
+                    dt_drij = hess_swtich_function(coords[i]-coords[j], r, dr) * tk**2
+                    dt_NC[i,i] += dt_drij
+                    dt_NC[i,j] -= dt_drij
+                    dt_NC[j,i] -= dt_drij
+                    dt_NC[j,j] += dt_drij
+                    t = swtich_function(coords[i]-coords[j], r, dr)
+                    dt_NC += t * (2 * tk * dtk)
+                    t_NC += t * tk**2
+
+                    r, dr = r_zz.get(('N','C3'), (0.0, 0.0))
+                    dt_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    dt_NC3[i,i] += dt_drij
+                    dt_NC3[i,j] -= dt_drij
+                    dt_NC3[j,i] -= dt_drij
+                    dt_NC3[j,j] += dt_drij
+            sig_NC = get_bond_tension(('N','C'))
+            sig_NC3= get_bond_tension(('N','C3'))
+            tension += sig_NC * (1.3 * t_NC**0.3 * dt_NC) + sig_NC3 * dt_NC3
+            tensions.append(tension)
+
+        if sym_i == 'O':
+            dt_OC = np.zeros([natm,natm,3,3])
+            dt_ON = np.zeros([natm,natm,3,3])
+            dt_OO = np.zeros([natm,natm,3,3])
+            dt_OP = np.zeros([natm,natm,3,3])
+            for j, sym_j in enumerate(symbols):
+                if sym_j == 'C':
+                    r, dr = r_zz.get(('O','C'), (0.0, 0.0))
+                    dt_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    dt_OC[i,i] += dt_drij
+                    dt_OC[i,j] -= dt_drij
+                    dt_OC[j,i] -= dt_drij
+                    dt_OC[j,j] += dt_drij
+                if sym_j == 'N':
+                    r, dr = r_zz.get(('O','N'), (0.0, 0.0))
+                    dt_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    dt_ON[i,i] += dt_drij
+                    dt_ON[i,j] -= dt_drij
+                    dt_ON[j,i] -= dt_drij
+                    dt_ON[j,j] += dt_drij
+                if sym_j == 'O' and j != i:
+                    r, dr = r_zz.get(('O','O'), (0.0, 0.0))
+                    dt_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    dt_OO[i,i] += dt_drij
+                    dt_OO[i,j] -= dt_drij
+                    dt_OO[j,i] -= dt_drij
+                    dt_OO[j,j] += dt_drij
+                if sym_j == 'P':
+                    r, dr = r_zz.get(('O','P'), (0.0, 0.0))
+                    dt_drij = hess_swtich_function(coords[i]-coords[j], r, dr)
+                    dt_OP[i,i] += dt_drij
+                    dt_OP[i,j] -= dt_drij
+                    dt_OP[j,i] -= dt_drij
+                    dt_OP[j,j] += dt_drij
+            sig_OC = get_bond_tension(('O','C'))
+            sig_ON = get_bond_tension(('O','N'))
+            sig_OO = get_bond_tension(('O','O'))
+            sig_OP = get_bond_tension(('O','P'))
+            tension += sig_OC * dt_OC + sig_ON * dt_ON + sig_OO * dt_OO + sig_OP * dt_OP
+            tensions.append(tension)
+    return cupy.asarray(tensions)
+
+def get_cds(smdobj):
+    mol = smdobj.mol
+    solvent = smdobj.solvent
+    def smd_grad_scanner(mol):
+        smdobj_tmp = smd.SMD(mol)
+        smdobj_tmp.solvent = solvent
+        return smd_grad.get_cds(smdobj_tmp)
+
+    log = logger.new_logger(mol, mol.verbose)
+    t1 = log.init_timer()
+
+    eps = 1e-4
+    natm = mol.natm
+    hess_cds = np.zeros([natm,natm,3,3])
+    for ia in range(mol.natm):
+        for j in range(3):
+            coords = mol.atom_coords(unit='B')
+            coords[ia,j] += eps
+            mol.set_geom_(coords, unit='B')
+            mol.build()
+            grad0_cds = smd_grad_scanner(mol)
+
+            coords[ia,j] -= 2.0*eps
+            mol.set_geom_(coords, unit='B')
+            mol.build()
+            grad1_cds = smd_grad_scanner(mol)
+
+            coords[ia,j] += eps
+            mol.set_geom_(coords, unit='B')
+            hess_cds[ia,:,j] = (grad0_cds - grad1_cds) / (2.0 * eps)
+    t1 = log.timer_debug1('solvent energy', *t1)
+    return hess_cds # hartree
+
+def make_hess_object(hess_method):
+    '''For hess_method in vacuum, add nuclear Hessian of solvent smdobj'''
+    if hess_method.base.with_solvent.frozen:
+        raise RuntimeError('Frozen solvent model is not avialbe for energy hessian')
+
+    name = (hess_method.base.with_solvent.__class__.__name__
+            + hess_method.__class__.__name__)
+    return lib.set_class(WithSolventHess(hess_method),
+                         (WithSolventHess, hess_method.__class__), name)
+
+class WithSolventHess:
+    _keys = {'de_solvent', 'de_solute'}
+
+    def __init__(self, hess_method):
+        self.__dict__.update(hess_method.__dict__)
+        self.de_solvent = None
+        self.de_solute = None
+
+    def kernel(self, *args, dm=None, atmlst=None, **kwargs):
+        dm = kwargs.pop('dm', None)
+        if dm is None:
+            dm = self.base.make_rdm1(ao_repr=True)
+        is_equilibrium = self.base.with_solvent.equilibrium_solvation
+        self.base.with_solvent.equilibrium_solvation = True
+        self.de_solvent = pcm_hess.hess_elec(self.base.with_solvent, dm, verbose=self.verbose)
+        #self.de_solvent+= hess_nuc(self.base.with_solvent)
+        self.de_solute = super().kernel(*args, **kwargs)
+        self.de = self.de_solute + self.de_solvent
+        self.de += get_cds(self.base.with_solvent)
+        self.base.with_solvent.equilibrium_solvation = is_equilibrium
+        return self.de
+
+    def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
+        if atmlst is None:
+            atmlst = range(self.mol.natm)
+        h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+        dv = pcm_hess.fd_grad_vmat(self.base.with_solvent, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+        for i0, ia in enumerate(atmlst):
+            h1ao[i0] += dv[i0]
+        return h1ao
+
+    def _finalize(self):
+        # disable _finalize. It is called in grad_method.kernel method
+        # where self.de was not yet initialized.
+        pass
diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py
index 335de1f7..50e3957f 100644
--- a/gpu4pyscf/solvent/pcm.py
+++ b/gpu4pyscf/solvent/pcm.py
@@ -25,7 +25,7 @@
 from pyscf import gto, df
 from pyscf.dft import gen_grid
 from pyscf.data import radii
-from pyscf.solvent import ddcosmo
+from pyscf.solvent import ddcosmo#, _attach_solvent
 from gpu4pyscf.solvent import _attach_solvent
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib import logger
@@ -122,7 +122,8 @@ def gen_surface(mol, ng=302, rad=modified_Bondi, vdw_scale=1.2, r_probe=0.0):
 
         atom_grid = r_vdw * unit_sphere[:,:3] + atom_coords[ia,:]
         #riJ = scipy.spatial.distance.cdist(atom_grid[:,:3], atom_coords)
-        riJ = cupy.sum((atom_grid[:,None,:] - atom_coords[None,:,:])**2, axis=2)**0.5
+        #riJ = cupy.sum((atom_grid[:,None,:] - atom_coords[None,:,:])**2, axis=2)**0.5
+        riJ = dist_matrix(atom_grid, atom_coords)
         diJ = (riJ - R_in_J) / R_sw_J
         diJ[:,ia] = 1.0
         diJ[diJ<1e-8] = 0.0
@@ -193,7 +194,7 @@ def get_D_S(surface, with_S=True, with_D=False):
     xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5
     #rij = scipy.spatial.distance.cdist(grid_coords, grid_coords)
     #rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5
-    rij = dist_matrix(grid_coords)
+    rij = dist_matrix(grid_coords, grid_coords)
     xi_r_ij = xi_ij * rij
     cupy.fill_diagonal(rij, 1)
     S = scipy.special.erf(xi_r_ij) / rij
@@ -201,8 +202,13 @@ def get_D_S(surface, with_S=True, with_D=False):
 
     D = None
     if with_D:
-        drij = cupy.expand_dims(grid_coords, axis=1) - grid_coords
-        nrij = cupy.sum(drij * norm_vec, axis=-1)
+        #drij = cupy.expand_dims(grid_coords, axis=1) - grid_coords
+        #nri = cupy.sum(grid_coords * norm_vec, axis=-1)
+        #nrij = cupy.expand_dims(nri, axis=1) - nri
+        #cupy.expand_dims(grid_coords, axis=1) * norm_vec
+        nrij = grid_coords.dot(norm_vec.T) - cupy.sum(grid_coords * norm_vec, axis=-1)
+
+        #nrij = cupy.sum(drij * norm_vec, axis=-1)
 
         D = S*nrij/rij**2 -2.0*xi_r_ij/PI**0.5*cupy.exp(-xi_r_ij**2)*nrij/rij**3
         cupy.fill_diagonal(D, -charge_exp * (2.0 / PI)**0.5 / (2.0 * R_vdw))
diff --git a/gpu4pyscf/solvent/smd.py b/gpu4pyscf/solvent/smd.py
index 5fb5a8e0..1cc79f98 100644
--- a/gpu4pyscf/solvent/smd.py
+++ b/gpu4pyscf/solvent/smd.py
@@ -18,12 +18,14 @@
 '''
 
 import numpy as np
+import scipy
 import cupy
 from pyscf import lib
 from pyscf.data import radii
 from pyscf.dft import gen_grid
 from gpu4pyscf.solvent import pcm, _attach_solvent
 from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import dist_matrix
 
 @lib.with_doc(_attach_solvent._for_scf.__doc__)
 def smd_for_scf(mf, solvent_obj=None, dm=None):
@@ -353,17 +355,13 @@ def get_atom_tension(sym_i):
         t += sigma_beta.get(sym_i, 0.0) * beta
         return t
 
-    rij = cupy.sum((coords[:,None,:] - coords[None,:,:])**2, axis=2)**0.5
+    rij = scipy.spatial.distance.cdist(coords, coords)
     tensions = []
     for i, sym_i in enumerate(symbols):
         if sym_i not in ['H', 'C', 'N', 'O', 'F', 'Si', 'S', 'Cl', 'Br']:
             tensions.append(0)
             continue
 
-        #sig_n = sigma_n.get(sym_i, 0.0)
-        #sig_a = sigma_alpha.get(sym_i, 0.0)
-        #sig_b = sigma_beta.get(sym_i, 0.0)
-        #tension = sig_n * n + sig_a * alpha + sig_b * beta
         tension = get_atom_tension(sym_i)
         if sym_i in ['F', 'Si', 'S', 'Cl', 'Br']:
             tensions.append(tension)
@@ -374,50 +372,53 @@ def get_atom_tension(sym_i):
             t_HO = 0.0
             for j, sym_j in enumerate(symbols):
                 if sym_j == 'C':
-                    r, dr = r_zz['H','C']
+                    r, dr = r_zz.get(('H','C'), (0.0, 0.0))
                     t_HC += swtich_function(rij[i,j], r, dr)
                 if sym_j == 'O':
-                    r, dr = r_zz['H','O']
+                    r, dr = r_zz.get(('H','O'), (0.0, 0.0))
                     t_HO += swtich_function(rij[i,j], r, dr)
             sig_HC = get_bond_tension(('H','C'))
             sig_HO = get_bond_tension(('H','O'))
             tension += sig_HC * t_HC + sig_HO * t_HO
             tensions.append(tension)
+            continue
 
         if sym_i == 'C':
             t_CC = 0.0
             t_CN = 0.0
             for j, sym_j in enumerate(symbols):
                 if sym_j == 'C' and i != j:
-                    r, dr = r_zz['C', 'C']
+                    r, dr = r_zz.get(('C', 'C'), (0.0, 0.0))
                     t_CC += swtich_function(rij[i,j], r, dr)
                 if sym_j == 'N':
-                    r, dr = r_zz['C', 'N']
+                    r, dr = r_zz.get(('C', 'N'), (0.0, 0.0))
                     t_CN += swtich_function(rij[i,j], r, dr)
             sig_CC = get_bond_tension(('C','C'))
             sig_CN = get_bond_tension(('C','N'))
             tension += sig_CC * t_CC + sig_CN * t_CN**2
             tensions.append(tension)
+            continue
 
         if sym_i == 'N':
             t_NC = 0.0
             t_NC3 = 0.0
             for j, sym_j in enumerate(symbols):
                 if sym_j == 'C':
-                    r, dr = r_zz['N','C']
+                    r, dr = r_zz.get(('N','C'), (0.0,0.0))
                     tk = 0.0
                     for k, sym_k in enumerate(symbols):
                         if k != i and k != j:
-                            rjk, drjk = r_zz['C', sym_k]
+                            rjk, drjk = r_zz.get(('C', sym_k), (0.0,0.0))
                             tk += swtich_function(rij[j,k], rjk, drjk)
                     t_NC += swtich_function(rij[i,j], r, dr) * tk**2
 
-                    r, dr = r_zz['N','C3']
+                    r, dr = r_zz.get(('N','C3'), (0.0, 0.0))
                     t_NC3 += swtich_function(rij[i,j], r, dr)
             sig_NC = get_bond_tension(('N','C'))
             sig_NC3= get_bond_tension(('N','C3'))
             tension += sig_NC * t_NC**1.3 + sig_NC3 * t_NC3
             tensions.append(tension)
+            continue
 
         if sym_i == 'O':
             t_OC = 0.0
@@ -426,16 +427,16 @@ def get_atom_tension(sym_i):
             t_OP = 0.0
             for j, sym_j in enumerate(symbols):
                 if sym_j == 'C':
-                    r, dr = r_zz['O','C']
+                    r, dr = r_zz.get(('O','C'), (0.0, 0.0))
                     t_OC += swtich_function(rij[i,j], r, dr)
                 if sym_j == 'N':
-                    r, dr = r_zz['O','N']
+                    r, dr = r_zz.get(('O','N'), (0.0, 0.0))
                     t_ON += swtich_function(rij[i,j], r, dr)
                 if sym_j == 'O' and j != i:
-                    r, dr = r_zz['O','O']
+                    r, dr = r_zz.get(('O','O'), (0.0, 0.0))
                     t_OO += swtich_function(rij[i,j], r, dr)
                 if sym_j == 'P':
-                    r, dr = r_zz['O','P']
+                    r, dr = r_zz.get(('O','P'), (0.0, 0.0))
                     t_OP += swtich_function(rij[i,j], r, dr)
             sig_OC = get_bond_tension(('O','C'))
             sig_ON = get_bond_tension(('O','N'))
@@ -443,6 +444,7 @@ def get_atom_tension(sym_i):
             sig_OP = get_bond_tension(('O','P'))
             tension += sig_OC * t_OC + sig_ON * t_ON + sig_OO * t_OO + sig_OP * t_OP
             tensions.append(tension)
+            continue
     return cupy.asarray(tensions)
 
 def molecular_surface_tension(beta, gamma, phi, psi):
@@ -486,23 +488,23 @@ def get_cds(smdobj):
 
     # generate surface for calculating SASA
     rad = radii.VDW + 0.4/radii.BOHR
-    surface = pcm.gen_surface(mol, ng=302, rad=rad)
+    surface = pcm.gen_surface(mol, ng=smdobj.sasa_ng, rad=rad)
     area = surface['area']
     gridslice = surface['gslice_by_atom']
     SASA = cupy.asarray([cupy.sum(area[p0:p1], axis=0) for p0,p1, in gridslice])
     SASA *= radii.BOHR**2
     mol_cds = mol_tension * cupy.sum(SASA) / 1000 # in kcal/mol
     atm_cds = cupy.sum(SASA * atm_tension) / 1000 # in kcal/mol
-
     return (mol_cds + atm_cds)/hartree2kcal # hartree
 
 class SMD(pcm.PCM):
     _keys = {
-        'intopt', 'method', 'e_cds', 'solvent_descriptors', 'r_probe'
+        'intopt', 'method', 'e_cds', 'solvent_descriptors', 'r_probe', 'sasa_ng'
     }
     def __init__(self, mol, solvent=''):
         super().__init__(mol)
         self.vdw_scale = 1.0
+        self.sasa_ng = 590 # quadrature grids for calculating SASA
         self.r_probe = 0.4/radii.BOHR
         self.method = 'SMD'  # use IEFPCM for electrostatic
         if solvent not in solvent_db:
@@ -566,10 +568,24 @@ def get_cds(self):
         return get_cds(self)
 
     def nuc_grad_method(self, grad_method):
-        raise RuntimeError('SMD gradient is not implemented')
+        from gpu4pyscf.solvent.grad import smd as smd_grad
+        if self.frozen:
+            raise RuntimeError('Frozen solvent model is not supported')
+        from gpu4pyscf import scf
+        if isinstance(grad_method.base, scf.hf.RHF):
+            return smd_grad.make_grad_object(grad_method)
+        else:
+            raise RuntimeError('Only SCF gradient is supported')
 
     def Hessian(self, hess_method):
-        raise RuntimeError('SMD Hessian is not implemented')
+        from gpu4pyscf.solvent.hessian import smd as smd_hess
+        if self.frozen:
+            raise RuntimeError('Frozen solvent model is not supported')
+        from gpu4pyscf import scf
+        if isinstance(hess_method.base, scf.hf.RHF):
+            return smd_hess.make_hess_object(hess_method)
+        else:
+            raise RuntimeError('Only SCF gradient is supported')
 
     def reset(self, mol=None):
         super().reset(mol)
diff --git a/gpu4pyscf/solvent/tests/test_smd.py b/gpu4pyscf/solvent/tests/test_smd.py
index 45e2028a..b322f4ce 100644
--- a/gpu4pyscf/solvent/tests/test_smd.py
+++ b/gpu4pyscf/solvent/tests/test_smd.py
@@ -40,20 +40,23 @@ def tearDownModule():
 class KnownValues(unittest.TestCase):
     def test_cds_solvent(self):
         smdobj = smd.SMD(mol)
+        smdobj.sasa_ng = 590
         smdobj.solvent = 'toluene'
         e_cds = smdobj.get_cds()
-        assert numpy.abs(e_cds - -0.0013476060879874362) < 1e-8
+        assert numpy.abs(e_cds - -0.0013476016530476354) < 1e-8
 
     def test_cds_water(self):
         smdobj = smd.SMD(mol)
+        smdobj.sasa_ng = 590
         smdobj.solvent = 'water'
         e_cds = smdobj.get_cds()
-        assert numpy.abs(e_cds - 0.0022847142144050057) < 1e-8
+        assert numpy.abs(e_cds - 0.0022903044356530726) < 1e-8
 
     def test_smd_solvent(self):
         mf = scf.RHF(mol)
         mf = mf.SMD()
         mf.with_solvent.solvent = 'ethanol'
+        mf.with_solvent.sasa_ng = 590
         e_tot = mf.kernel()
         assert numpy.abs(e_tot - -76.075066568) < 2e-4
 
@@ -61,9 +64,12 @@ def test_smd_water(self):
         mf = scf.RHF(mol)
         mf = mf.SMD()
         mf.with_solvent.solvent = 'water'
+        mf.with_solvent.sasa_ng = 590
         e_tot = mf.kernel()
         assert numpy.abs(e_tot - -76.0756052903) < 2e-4
 
+    # TODO: add more test for other molecules
+
 if __name__ == "__main__":
     print("Full Tests for SMDs")
     unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/tests/test_smd_grad.py b/gpu4pyscf/solvent/tests/test_smd_grad.py
new file mode 100644
index 00000000..ec8173f7
--- /dev/null
+++ b/gpu4pyscf/solvent/tests/test_smd_grad.py
@@ -0,0 +1,87 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+import numpy
+from pyscf import gto
+from gpu4pyscf import scf
+from gpu4pyscf.solvent import pcm
+from gpu4pyscf.solvent.grad import pcm as pcm_grad
+from gpu4pyscf.solvent.grad import smd as smd_grad
+from gpu4pyscf.solvent import smd
+
+def setUpModule():
+    global mol
+    mol = gto.Mole()
+    mol.atom = '''P 0.000 0.000 0.000
+O 1.500 0.000 0.000
+O -1.500 0.000 0.000
+O 0.000 1.500 0.000
+O 0.000 -1.500 0.000
+H 1.000 1.000 0.000
+H -1.000 -1.000 0.000
+H 0.000 -2.000 0.000
+'''
+    mol.basis = 'sto3g'
+    mol.output = '/dev/null'
+    mol.build(verbose=0)
+
+def tearDownModule():
+    global mol
+    mol.stdout.close()
+    del mol
+
+def _check_grad(mol, solvent='water'):
+    natm = mol.natm
+    fd_cds = numpy.zeros([natm,3])
+    eps = 1e-4
+    for ia in range(mol.natm):
+        for j in range(3):
+            coords = mol.atom_coords(unit='B')
+            coords[ia,j] += eps
+            mol.set_geom_(coords, unit='B')
+            mol.build()
+
+            smdobj = smd.SMD(mol)
+            smdobj.solvent = solvent
+            e0_cds = smdobj.get_cds()
+
+            coords[ia,j] -= 2.0*eps
+            mol.set_geom_(coords, unit='B')
+            mol.build()
+
+            smdobj = smd.SMD(mol)
+            smdobj.solvent = solvent
+            e1_cds = smdobj.get_cds()
+
+            coords[ia,j] += eps
+            mol.set_geom_(coords, unit='B')
+            fd_cds[ia,j] = (e0_cds - e1_cds) / (2.0 * eps)
+
+    smdobj = smd.SMD(mol)
+    smdobj.solvent = solvent
+    grad_cds = smd_grad.get_cds(smdobj)
+    assert numpy.linalg.norm(fd_cds - grad_cds) < 1e-8
+
+class KnownValues(unittest.TestCase):
+    def test_grad_water(self):
+        _check_grad(mol, solvent='water')
+
+    def test_grad_solvent(self):
+        _check_grad(mol, solvent='ethanol')
+
+if __name__ == "__main__":
+    print("Full Tests for Gradient of SMD")
+    unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/solvent/tests/test_smd_hessian.py b/gpu4pyscf/solvent/tests/test_smd_hessian.py
new file mode 100644
index 00000000..24193663
--- /dev/null
+++ b/gpu4pyscf/solvent/tests/test_smd_hessian.py
@@ -0,0 +1,58 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+import numpy
+from pyscf import gto
+from gpu4pyscf import scf
+from gpu4pyscf.solvent.hessian import smd as smd_hess
+from gpu4pyscf.solvent import smd
+
+def setUpModule():
+    global mol
+    mol = gto.Mole()
+    mol.atom = '''P 0.000 0.000 0.000
+O 1.500 0.000 0.000
+O -1.500 0.000 0.000
+O 0.000 1.500 0.000
+O 0.000 -1.500 0.000
+H 1.000 1.000 0.000
+H -1.000 -1.000 0.000
+H 0.000 -2.000 0.000
+'''
+    mol.basis = 'sto3g'
+    mol.output = '/dev/null'
+    mol.build(verbose=0)
+
+def tearDownModule():
+    global mol
+    mol.stdout.close()
+    del mol
+
+def _check_hess(mol, solvent='water'):
+    smdobj = smd.SMD(mol)
+    smdobj.solvent = solvent
+    smd_hess.get_cds(smdobj)
+
+class KnownValues(unittest.TestCase):
+    def test_hess_water(self):
+        _check_hess(mol, solvent='water')
+
+    def test_hess_solvent(self):
+        _check_hess(mol, solvent='ethanol')
+
+if __name__ == "__main__":
+    print("Full Tests for Hessian of SMD")
+    unittest.main()
\ No newline at end of file