efficient PCM model (pyscf#150)

* efficient PCM model * flake8 * pcm_d_s.cu -> pcm.cu * improve pcm cuda code * uncomment pcm * uncomment pcm * fixed bug in pcm model
bytedance · May 12, 2024 · c4fba3c · c4fba3c
1 parent 0509a00
commit c4fba3c
Show file tree

Hide file tree

Showing 7 changed files with 292 additions and 17 deletions.
diff --git a/gpu4pyscf/lib/CMakeLists.txt b/gpu4pyscf/lib/CMakeLists.txt
@@ -117,6 +117,7 @@ add_subdirectory(gint)
 add_subdirectory(gvhf)
 add_subdirectory(gdft)
 add_subdirectory(cupy_helper)
+add_subdirectory(solvent)
 
 option(BUILD_LIBXC "Using libxc for DFT" ON)
 if(BUILD_LIBXC)

diff --git a/gpu4pyscf/lib/solvent/CMakeLists.txt b/gpu4pyscf/lib/solvent/CMakeLists.txt
@@ -0,0 +1,25 @@
+# gpu4pyscf is a plugin to use Nvidia GPU in PySCF package
+#
+# Copyright (C) 2022 Qiming Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_80")
+
+add_library(solvent SHARED
+  pcm.cu
+)
+
+set_target_properties(solvent PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
+set_target_properties(solvent PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}")
diff --git a/gpu4pyscf/lib/solvent/pcm.cu b/gpu4pyscf/lib/solvent/pcm.cu
@@ -0,0 +1,166 @@
+/* Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#define THREADS        32
+#define SQRT2_PI       0.7978845608028654
+#define SQRT_PI        1.7724538509055159
+
+// D and S matrix in J. Chem. Phys. 133, 244111 (2010)
+__global__
+static void _pcm_d_s(double *matrix_d, double *matrix_s,
+                    const double *coords, const double *norm_vec, const double *r_vdw,
+                    const double *charge_exp, const double *switch_fun,
+                    int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= n || j >= n){
+        return;
+    }
+
+    // calculate xi
+    double ei = charge_exp[i];
+    double ej = charge_exp[j];
+    double xi_ij = ei * ej / sqrt(ei*ei + ej*ej);
+
+    // calculate r
+    double xi = coords[3*i];
+    double yi = coords[3*i+1];
+    double zi = coords[3*i+2];
+    double xj = coords[3*j];
+    double yj = coords[3*j+1];
+    double zj = coords[3*j+2];
+    double dx = xi - xj;
+    double dy = yi - yj;
+    double dz = zi - zj;
+    double rij = norm3d(dx, dy, dz);
+
+    double xi_r_ij = xi_ij * rij;
+    if (i == j) rij = 1.0;
+    double s = erf(xi_r_ij) / rij;
+    if (i == j) s = charge_exp[i] * SQRT2_PI / switch_fun[i];
+    matrix_s[i*n+j] = s;
+
+    if (matrix_d != NULL){
+        double nxj = norm_vec[3*j];
+        double nyj = norm_vec[3*j+1];
+        double nzj = norm_vec[3*j+2];
+
+        double nrij = 0.0;
+        nrij += (xi - xj) * nxj;
+        nrij += (yi - yj) * nyj;
+        nrij += (zi - zj) * nzj;
+
+        double rij2 = rij*rij;
+        double rij3 = rij2*rij;
+        double xi_r2_ij = xi_r_ij * xi_r_ij;
+        double d = s * nrij / rij2 - 2.0*xi_r_ij/SQRT_PI*exp(-xi_r2_ij)*nrij/rij3;
+        if (i == j) d = -charge_exp[i] * SQRT2_PI / (2.0*r_vdw[i]);
+        matrix_d[i*n+j] = d;
+    }
+}
+
+__global__
+static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds,
+                    const double *coords, const double *norm_vec, const double *r_vdw,
+                    const double *charge_exp, const double *switch_fun,
+                    int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= n || j >= n){
+        return;
+    }
+
+    // calculate xi
+    double ei = charge_exp[i];
+    double ej = charge_exp[j];
+    double xi_ij = ei * ej / sqrt(ei*ei + ej*ej);
+
+    // calculate r
+    double dx = coords[3*i]   - coords[3*j];
+    double dy = coords[3*i+1] - coords[3*j+1];
+    double dz = coords[3*i+2] - coords[3*j+2];
+    double rij = norm3d(dx, dy, dz);
+
+    double xi_r_ij = xi_ij * rij;
+    double xi_r2_ij = xi_r_ij * xi_r_ij;
+    if (i == j) rij = 1.0;
+    double rij2 = rij*rij;
+
+    double dS_dr = -(erf(xi_r_ij) -  2.0*xi_r_ij/ SQRT_PI * exp(-xi_r2_ij)) / rij2;
+    if (i == j) dS_dr = 0.0;
+    double dx_rij = dx / rij;
+    double dy_rij = dy / rij;
+    double dz_rij = dz / rij;
+
+    matrix_ds[3*(i*n+j)]   = dS_dr * dx_rij;
+    matrix_ds[3*(i*n+j)+1] = dS_dr * dy_rij;
+    matrix_ds[3*(i*n+j)+2] = dS_dr * dz_rij;
+
+    if (matrix_dd != NULL){
+        double nxj = norm_vec[3*j];
+        double nyj = norm_vec[3*j+1];
+        double nzj = norm_vec[3*j+2];
+        double nj_rij = dx*nxj + dy*nyj + dz*nzj;
+        double rij3 = rij2*rij;
+        double dD_dri = 4.0*xi_r2_ij*xi_ij / SQRT_PI*exp(-xi_r2_ij)*nj_rij/rij3;
+        if (i == j) dD_dri = 0.0;
+
+        matrix_dd[3*(i*n+j)]   = dD_dri*dx_rij + dS_dr*(-nxj/rij + 3.0*nj_rij/rij2*dx_rij);
+        matrix_dd[3*(i*n+j)+1] = dD_dri*dy_rij + dS_dr*(-nyj/rij + 3.0*nj_rij/rij2*dy_rij);
+        matrix_dd[3*(i*n+j)+2] = dD_dri*dz_rij + dS_dr*(-nzj/rij + 3.0*nj_rij/rij2*dz_rij);
+    }
+}
+
+extern "C" {
+int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s,
+                    const double *coords, const double *norm_vec, const double *r_vdw,
+                    const double *charge_exp, const double *switch_fun,
+                    int n)
+{
+    int ntilex = (n + THREADS - 1) / THREADS;
+    int ntiley = (n + THREADS - 1) / THREADS;
+    dim3 threads(THREADS, THREADS);
+    dim3 blocks(ntilex, ntiley);
+    _pcm_d_s<<<blocks, threads, 0, stream>>>(matrix_d, matrix_s, coords, norm_vec, r_vdw, charge_exp, switch_fun, n);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        return 1;
+    }
+    return 0;
+}
+
+int pcm_dd_ds(cudaStream_t stream, double *matrix_dD, double *matrix_dS,
+                    const double *coords, const double *norm_vec, const double *r_vdw,
+                    const double *charge_exp, const double *switch_fun,
+                    int n)
+{
+    int ntilex = (n + THREADS - 1) / THREADS;
+    int ntiley = (n + THREADS - 1) / THREADS;
+    dim3 threads(THREADS, THREADS);
+    dim3 blocks(ntilex, ntiley);
+    _pcm_dD_dS<<<blocks, threads, 0, stream>>>(matrix_dD, matrix_dS, coords, norm_vec, r_vdw, charge_exp, switch_fun, n);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        return 1;
+    }
+    return 0;
+}
+}
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
@@ -20,17 +20,20 @@
 
 import numpy
 import cupy
+import ctypes
 from cupyx import scipy
 from pyscf import lib
 from pyscf import gto
 from pyscf.grad import rhf as rhf_grad
 
 from gpu4pyscf.solvent.pcm import PI, switch_h
 from gpu4pyscf.df import int3c2e
-from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib.cupy_helper import contract, load_library
 from gpu4pyscf.lib import logger
+from pyscf import lib as pyscf_lib
 
 libdft = lib.load_library('libdft')
+libsolvent = load_library('libsolvent')
 
 def grad_switch_h(x):
     ''' first derivative of h(x)'''
@@ -98,7 +101,7 @@ def get_dF_dA(surface):
 
     return dF, dA
 
-def get_dD_dS(surface, dF, with_S=True, with_D=False):
+def get_dD_dS_slow(surface, dF, with_S=True, with_D=False):
     '''
     derivative of D and S w.r.t grids, partial_i D_ij = -partial_j D_ij
     S is symmetric, D is not
@@ -141,6 +144,39 @@ def get_dD_dS(surface, dF, with_S=True, with_D=False):
 
     return dD, dS, dSii
 
+def get_dD_dS(surface, dF, with_S=True, with_D=False, stream=None):
+    charge_exp  = surface['charge_exp']
+    grid_coords = surface['grid_coords']
+    switch_fun  = surface['switch_fun']
+    norm_vec    = surface['norm_vec']
+    R_vdw       = surface['R_vdw']
+    n = charge_exp.shape[0]
+    dS = cupy.empty([n,n,3])
+    dD = None
+    dS_ptr = ctypes.cast(dS.data.ptr, ctypes.c_void_p)
+    dD_ptr = pyscf_lib.c_null_ptr()
+    if with_D:
+        dD = cupy.empty([n,n,3])
+        dD_ptr = ctypes.cast(dD.data.ptr, ctypes.c_void_p)
+    if stream is None:
+        stream = cupy.cuda.get_current_stream()
+    err = libsolvent.pcm_dd_ds(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
+        dD_ptr, dS_ptr,
+        ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p),
+        ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p),
+        ctypes.cast(R_vdw.data.ptr, ctypes.c_void_p),
+        ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p),
+        ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(n)
+    )
+    if err != 0:
+        raise RuntimeError('Failed in generating PCM dD and dS matrices.')
+
+    dSii_dF = -charge_exp * (2.0/PI)**0.5 / switch_fun**2
+    dSii = cupy.expand_dims(dSii_dF, axis=(1,2)) * dF
+    return dD, dS, dSii
+
 def grad_nuc(pcmobj, dm):
     mol = pcmobj.mol
     log = logger.new_logger(mol, mol.verbose)

diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py
@@ -29,9 +29,10 @@
 from gpu4pyscf.solvent import _attach_solvent
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib import logger
-from gpu4pyscf.lib.cupy_helper import dist_matrix
+from gpu4pyscf.lib.cupy_helper import dist_matrix, load_library
 
 libdft = lib.load_library('libdft')
+libsolvent = load_library('libsolvent')
 
 @lib.with_doc(_attach_solvent._for_scf.__doc__)
 def pcm_for_scf(mf, solvent_obj=None, dm=None):
@@ -179,7 +180,7 @@ def get_F_A(surface):
     A = weights*R_vdw**2*switch_fun
     return switch_fun, A
 
-def get_D_S(surface, with_S=True, with_D=False):
+def get_D_S_slow(surface, with_S=True, with_D=False):
     '''
     generate D and S matrix in  J. Chem. Phys. 133, 244111 (2010)
     The diagonal entries of S is not filled
@@ -192,8 +193,6 @@ def get_D_S(surface, with_S=True, with_D=False):
 
     xi_i, xi_j = cupy.meshgrid(charge_exp, charge_exp, indexing='ij')
     xi_ij = xi_i * xi_j / (xi_i**2 + xi_j**2)**0.5
-    #rij = scipy.spatial.distance.cdist(grid_coords, grid_coords)
-    #rij = cupy.sum((grid_coords[:,None,:] - grid_coords[None,:,:])**2, axis=2)**0.5
     rij = dist_matrix(grid_coords, grid_coords)
     xi_r_ij = xi_ij * rij
     cupy.fill_diagonal(rij, 1)
@@ -202,17 +201,40 @@ def get_D_S(surface, with_S=True, with_D=False):
 
     D = None
     if with_D:
-        #drij = cupy.expand_dims(grid_coords, axis=1) - grid_coords
-        #nri = cupy.sum(grid_coords * norm_vec, axis=-1)
-        #nrij = cupy.expand_dims(nri, axis=1) - nri
-        #cupy.expand_dims(grid_coords, axis=1) * norm_vec
         nrij = grid_coords.dot(norm_vec.T) - cupy.sum(grid_coords * norm_vec, axis=-1)
-
-        #nrij = cupy.sum(drij * norm_vec, axis=-1)
-
         D = S*nrij/rij**2 -2.0*xi_r_ij/PI**0.5*cupy.exp(-xi_r_ij**2)*nrij/rij**3
         cupy.fill_diagonal(D, -charge_exp * (2.0 / PI)**0.5 / (2.0 * R_vdw))
+    return D, S
 
+def get_D_S(surface, with_S=True, with_D=False, stream=None):
+    ''' Efficiently generating D matrix and S matrix in PCM models '''
+    charge_exp  = surface['charge_exp']
+    grid_coords = surface['grid_coords']
+    switch_fun  = surface['switch_fun']
+    norm_vec    = surface['norm_vec']
+    R_vdw       = surface['R_vdw']
+    n = charge_exp.shape[0]
+    S = cupy.empty([n,n])
+    D = None
+    S_ptr = ctypes.cast(S.data.ptr, ctypes.c_void_p)
+    D_ptr = lib.c_null_ptr()
+    if with_D:
+        D = cupy.empty([n,n])
+        D_ptr = ctypes.cast(D.data.ptr, ctypes.c_void_p)
+    if stream is None:
+        stream = cupy.cuda.get_current_stream()
+    err = libsolvent.pcm_d_s(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
+        D_ptr, S_ptr,
+        ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p),
+        ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p),
+        ctypes.cast(R_vdw.data.ptr, ctypes.c_void_p),
+        ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p),
+        ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(n)
+    )
+    if err != 0:
+        raise RuntimeError('Failed in generating PCM D and S matrices.')
     return D, S
 
 class PCM(lib.StreamObject):
@@ -301,7 +323,6 @@ def build(self, ng=None):
         atom_coords = mol.atom_coords(unit='B')
         atom_charges = mol.atom_charges()
 
-        # Move this to GPU
         auxmol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2)
         intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
         intopt.build(1e-14, diag_block_with_triu=False, aosym=True, group_size=256)