diff --git a/.gitignore b/.gitignore
index 9a680274..b7163d3f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,8 @@ output/
 *.swo
 *.swp
 *~
+*.log
+*.h5
 **/qchem_input.in
 #*
 dockerfiles/manylinux/wheelhouse
diff --git a/README.md b/README.md
index b7100771..69da2288 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ Features
 - Dispersion corrections via [DFTD3](https://github.com/dftd3/simple-dftd3) and [DFTD4](https://github.com/dftd4/dftd4);
 - Nonlocal functional correction (vv10) for SCF and gradient;
 - ECP is supported and calculated on CPU;
-- PCM solvent models and their analytical gradients;
+- PCM solvent models, analytical gradients, and semi-analytical Hessian matrix;
 
 Limitations
 --------
@@ -68,7 +68,7 @@ Examples
 import pyscf
 from gpu4pyscf.dft import rks
 
-atom =''' 
+atom ='''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
 H       0.7570000000     0.0000000000    -0.4696000000
@@ -91,7 +91,7 @@ Find more examples in gpu4pyscf/examples
 
 Benchmarks
 --------
-Speedup with GPU4PySCF v0.6.0 over Q-Chem 6.1 (Desity fitting, SCF, def2-tzvpp, def2-universal-jkfit, (99,590))
+Speedup with GPU4PySCF v0.6.0 on A100-80G over Q-Chem 6.1 on 32-cores CPU (Desity fitting, SCF, def2-tzvpp, def2-universal-jkfit, B3LYP, (99,590))
 
 | mol               |   natm |    LDA |    PBE |   B3LYP |    M06 |   wB97m-v |
 |:------------------|-------:|-------:|-------:|--------:|-------:|----------:|
diff --git a/benchmarks/df/dft_driver.py b/benchmarks/df/dft_driver.py
index 69cc7621..71000e45 100644
--- a/benchmarks/df/dft_driver.py
+++ b/benchmarks/df/dft_driver.py
@@ -1,3 +1,18 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
 import os
 import csv
 import pyscf
@@ -88,16 +103,16 @@ def run_dft(path, filename):
 
     hess_time = -1
     if args.with_hessian:
-        #try:
-        start_time = time.time()
-        h = mf.Hessian()
-        h.auxbasis_response = 1
-        h.max_memory = 40000
-        hess = h.kernel().reshape([3*mol.natm, 3*mol.natm])
-        hess_time = time.time() - start_time
-        #except Exception:
-        #   hess_time = -1
-        #    hess = -1
+        try:
+            start_time = time.time()
+            h = mf.Hessian()
+            h.auxbasis_response = 1
+            h.max_memory = 40000
+            hess = h.kernel().reshape([3*mol.natm, 3*mol.natm])
+            hess_time = time.time() - start_time
+        except Exception:
+            hess_time = -1
+            hess = -1
 
     np.savez(args.output_path+filename+'.npz', e_dft=e_dft, grad=f, hess=hess)
     return mol.natm, mol.nao, scf_time, grad_time, hess_time, e_dft
diff --git a/benchmarks/df/generate_tables.ipynb b/benchmarks/df/generate_tables.ipynb
deleted file mode 100644
index 322939e5..00000000
--- a/benchmarks/df/generate_tables.ipynb
+++ /dev/null
@@ -1,552 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 123,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 122,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Density fitting with difference basis"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 125,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 124,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| mol               |   natm |   sto-3g |   6-31g |   def2-svp |   def2-tzvpp |   def2-tzvpd |\n",
-      "|:------------------|-------:|---------:|--------:|-----------:|-------------:|-------------:|\n",
-      "| 020_Vitamin_C     |     20 |     0.92 |    1.5  |       2.13 |         5.94 |         8.36 |\n",
-      "| 031_Inosine       |     31 |     4.74 |    7.12 |      10.98 |        17.02 |        21.17 |\n",
-      "| 033_Bisphenol_A   |     33 |     4.53 |    6.24 |       7    |        16.55 |        20.96 |\n",
-      "| 037_Mg_Porphin    |     37 |     7.38 |    9.9  |      13.88 |        16.9  |        23.39 |\n",
-      "| 042_Penicillin_V  |     42 |     5.9  |    8.19 |      11.43 |        16.41 |        20.11 |\n",
-      "| 045_Ochratoxin_A  |     45 |     6.94 |   10.06 |      12.9  |        15.33 |        21.62 |\n",
-      "| 052_Cetirizine    |     52 |     7.15 |    9.86 |      13.85 |        17.34 |        23.24 |\n",
-      "| 057_Tamoxifen     |     57 |     7.48 |    8.95 |      13.19 |        19.26 |        24.22 |\n",
-      "| 066_Raffinose     |     66 |     8.22 |   10.12 |      14.98 |        15.28 |        16.1  |\n",
-      "| 084_Sphingomyelin |     84 |   nan    |    9.69 |      14.83 |        17.82 |        20.33 |\n",
-      "| 095_Azadirachtin  |     95 |    16.06 |   17.18 |      24.22 |        23.29 |       nan    |\n",
-      "| 113_Taxol         |    113 |    20.11 |   18.04 |      23.38 |        24    |       nan    |\n",
-      "| 168_Valinomycin   |    168 |    23.43 |   19.41 |     nan    |       nan    |       nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "A100_file = 'NVIDIA A100-SXM4-80GB.csv'\n",
-    "V100_file = 'Tesla V100-SXM2-32GB.csv'\n",
-    "qchem_file = 'qchem-32-cores-cpu.csv'\n",
-    "\n",
-    "keys = ['mol', 'natm']\n",
-    "empty = {'mol':[], 'natm':[]}\n",
-    "df_A100_scf = pd.DataFrame(empty)\n",
-    "df_V100_scf = pd.DataFrame(empty)\n",
-    "df_A100_grad = pd.DataFrame(empty)\n",
-    "df_V100_grad = pd.DataFrame(empty)\n",
-    "path = 'organic/basis/'\n",
-    "\n",
-    "for basis in ['sto-3g', '6-31g', 'def2-svp', 'def2-tzvpp', 'def2-tzvpd']:\n",
-    "    df_qchem = pd.read_csv(path + basis + '/' + qchem_file)\n",
-    "    df_qchem = df_qchem.rename(columns={'t_scf':'scf_qchem', 't_gradient':'grad_qchem'})\n",
-    "    \n",
-    "    df_A100 = pd.read_csv(path + basis + '/' + A100_file)\n",
-    "    df_A100 = df_A100.rename(columns={'t_scf':'scf_A100', 't_gradient':'grad_A100'})\n",
-    "    df_A100 = df_A100.merge(df_qchem, how='outer', on='mol')\n",
-    "    \n",
-    "    df_A100['scf_'+basis] = df_A100['scf_qchem']/df_A100['scf_A100']\n",
-    "    df_A100['grad_'+basis] = df_A100['grad_qchem']/df_A100['grad_A100']\n",
-    "    df_A100 = df_A100[keys+['scf_'+basis, 'grad_'+basis]]\n",
-    "    \n",
-    "    df_A100_scf = df_A100_scf.merge(df_A100[keys+['scf_'+basis]], how='outer', on=keys)\n",
-    "    df_A100_grad= df_A100_grad.merge(df_A100[keys+['grad_'+basis]], how='outer', on=keys)\n",
-    "    df_A100_scf = df_A100_scf.rename(columns={'scf_'+basis:basis})\n",
-    "    df_A100_grad = df_A100_grad.rename(columns={'grad_'+basis:basis})\n",
-    "    df_A100_scf[basis] = df_A100_scf[basis].apply(lambda x: round(x,2))\n",
-    "    df_A100_grad[basis] = df_A100_grad[basis].apply(lambda x: round(x,2))\n",
-    "\n",
-    "    df_V100 = pd.read_csv(path + basis + '/' + V100_file)\n",
-    "    df_V100 = df_V100.rename(columns={'t_scf':'scf_V100', 't_gradient':'grad_V100'})\n",
-    "    df_V100 = df_V100.merge(df_qchem, how='outer', on='mol')\n",
-    "    df_V100['scf_'+basis] = df_V100['scf_qchem']/df_V100['scf_V100']\n",
-    "    df_V100['grad_'+basis] = df_V100['grad_qchem']/df_V100['grad_V100']\n",
-    "\n",
-    "    df_V100_scf = df_V100_scf.merge(df_V100[keys+['scf_'+basis,]], how='outer', on=keys)\n",
-    "    df_V100_grad= df_V100_grad.merge(df_V100[keys+['grad_'+basis]], how='outer', on=keys)\n",
-    "    df_V100_scf = df_V100_scf.rename(columns={'scf_'+basis:basis})\n",
-    "    \n",
-    "    df_V100_grad = df_V100_grad.rename(columns={'grad_'+basis:basis})\n",
-    "    df_V100_scf[basis] = df_V100_scf[basis].apply(lambda x: round(x,2))\n",
-    "    df_V100_grad[basis] = df_V100_grad[basis].apply(lambda x: round(x,2))\n",
-    "    \n",
-    "print(df_A100_scf.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 127,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 126,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| mol               |   natm |   sto-3g |   6-31g |   def2-svp |   def2-tzvpp |   def2-tzvpd |\n",
-      "|:------------------|-------:|---------:|--------:|-----------:|-------------:|-------------:|\n",
-      "| 020_Vitamin_C     |     20 |     0.52 |    0.93 |       1.23 |         3.98 |         4.88 |\n",
-      "| 031_Inosine       |     31 |     0.97 |    1.92 |       3.03 |         6.79 |         8.19 |\n",
-      "| 033_Bisphenol_A   |     33 |     1.16 |    1.89 |       2.09 |         6.72 |         8.31 |\n",
-      "| 037_Mg_Porphin    |     37 |     1.79 |    3.55 |       4.49 |         7.64 |        10.55 |\n",
-      "| 042_Penicillin_V  |     42 |     1.37 |    2.62 |       3.63 |         7.69 |         9.24 |\n",
-      "| 045_Ochratoxin_A  |     45 |     1.58 |    3.23 |       4.12 |         7.27 |         9.88 |\n",
-      "| 052_Cetirizine    |     52 |     1.83 |    3.61 |       4.72 |         8.63 |        11.32 |\n",
-      "| 057_Tamoxifen     |     57 |     1.92 |    3.3  |       4.59 |         9.72 |         7.87 |\n",
-      "| 066_Raffinose     |     66 |     2.31 |    4.04 |       5.75 |         6.09 |         5.54 |\n",
-      "| 084_Sphingomyelin |     84 |   nan    |    3.29 |       4.92 |         7.32 |         8    |\n",
-      "| 095_Azadirachtin  |     95 |     4.63 |    8.46 |      10.55 |        13.83 |       nan    |\n",
-      "| 113_Taxol         |    113 |     6.55 |   10.1  |       9.43 |        12.31 |       nan    |\n",
-      "| 168_Valinomycin   |    168 |     9.23 |   11.66 |     nan    |       nan    |       nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(df_V100_scf.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 129,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 128,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| mol               |   natm |   sto-3g |   6-31g |   def2-svp |   def2-tzvpp |   def2-tzvpd |\n",
-      "|:------------------|-------:|---------:|--------:|-----------:|-------------:|-------------:|\n",
-      "| 020_Vitamin_C     |     20 |     3.13 |    4.12 |       5.98 |         9.71 |        10.7  |\n",
-      "| 031_Inosine       |     31 |    11.42 |    9.82 |      13.23 |        16.47 |        16.16 |\n",
-      "| 033_Bisphenol_A   |     33 |    13.28 |   10.3  |      11.94 |        16.08 |        16.02 |\n",
-      "| 037_Mg_Porphin    |     37 |    13.75 |   10.54 |      15.87 |        18.33 |        19.89 |\n",
-      "| 042_Penicillin_V  |     42 |    13.3  |   10.7  |      14.07 |        17.2  |        18.81 |\n",
-      "| 045_Ochratoxin_A  |     45 |    14.68 |   11.33 |      16.28 |        19.79 |        20.94 |\n",
-      "| 052_Cetirizine    |     52 |    21.46 |   14.62 |      19.55 |        20.51 |        21.93 |\n",
-      "| 057_Tamoxifen     |     57 |    20.97 |   16.37 |      18.78 |        20.27 |        21.96 |\n",
-      "| 066_Raffinose     |     66 |    25.4  |   17.78 |      25.71 |        23.88 |        22.38 |\n",
-      "| 084_Sphingomyelin |     84 |   nan    |   17.46 |      20.9  |        23.64 |        26.52 |\n",
-      "| 095_Azadirachtin  |     95 |    39.13 |   32.27 |      40.78 |        39.94 |       nan    |\n",
-      "| 113_Taxol         |    113 |    48.57 |   42.77 |      51.57 |        49.03 |       nan    |\n",
-      "| 168_Valinomycin   |    168 |    87.81 |   72.58 |     nan    |       nan    |       nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(df_A100_grad.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 131,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 130,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| mol               |   natm |   sto-3g |   6-31g |   def2-svp |   def2-tzvpp |   def2-tzvpd |\n",
-      "|:------------------|-------:|---------:|--------:|-----------:|-------------:|-------------:|\n",
-      "| 020_Vitamin_C     |     20 |     1.43 |    2.4  |       3.3  |         5.46 |         5.54 |\n",
-      "| 031_Inosine       |     31 |     3.01 |    4.22 |       5.14 |         7.06 |         6.84 |\n",
-      "| 033_Bisphenol_A   |     33 |     3.31 |    4.18 |       4.38 |         6.75 |         6.89 |\n",
-      "| 037_Mg_Porphin    |     37 |     4.13 |    5.08 |       6.42 |         7.89 |         8.54 |\n",
-      "| 042_Penicillin_V  |     42 |     4.05 |    5.06 |       5.89 |         7.88 |         8.39 |\n",
-      "| 045_Ochratoxin_A  |     45 |     4.59 |    5.42 |       6.8  |         8.6  |         8.93 |\n",
-      "| 052_Cetirizine    |     52 |     6.11 |    7.04 |       7.97 |         9.13 |         9.53 |\n",
-      "| 057_Tamoxifen     |     57 |     6.17 |    8.05 |       7.74 |         9.3  |         9.22 |\n",
-      "| 066_Raffinose     |     66 |     7.9  |    9.49 |      10.82 |        10.51 |         9.58 |\n",
-      "| 084_Sphingomyelin |     84 |   nan    |    7.64 |       7.99 |         9.56 |        10.39 |\n",
-      "| 095_Azadirachtin  |     95 |    13.3  |   17.59 |      16.25 |        17.93 |       nan    |\n",
-      "| 113_Taxol         |    113 |    17.55 |   23.43 |      20.81 |        21.54 |       nan    |\n",
-      "| 168_Valinomycin   |    168 |    31.21 |   38.79 |     nan    |       nan    |       nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(df_V100_grad.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Density fitting with different xc"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 133,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 132,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| mol               |   natm |    LDA |    PBE |   B3LYP |    M06 |   wB97m-v |\n",
-      "|:------------------|-------:|-------:|-------:|--------:|-------:|----------:|\n",
-      "| 020_Vitamin_C     |     20 |   2.86 |   6.09 |   13.11 |  11.58 |     17.46 |\n",
-      "| 031_Inosine       |     31 |  13.14 |  15.87 |   16.57 |  25.89 |     26.14 |\n",
-      "| 033_Bisphenol_A   |     33 |  12.31 |  16.88 |   16.54 |  28.45 |     28.82 |\n",
-      "| 037_Mg_Porphin    |     37 |  13.85 |  19.03 |   20.53 |  28.31 |     30.27 |\n",
-      "| 042_Penicillin_V  |     42 |  10.34 |  13.35 |   15.34 |  22.01 |     24.2  |\n",
-      "| 045_Ochratoxin_A  |     45 |  13.34 |  15.3  |   19.66 |  27.08 |     25.41 |\n",
-      "| 052_Cetirizine    |     52 |  17.79 |  17.44 |   19    |  24.41 |     25.87 |\n",
-      "| 057_Tamoxifen     |     57 |  14.7  |  16.57 |   18.4  |  24.86 |     25.47 |\n",
-      "| 066_Raffinose     |     66 |  13.77 |  14.2  |   20.47 |  22.94 |     25.35 |\n",
-      "| 084_Sphingomyelin |     84 |  14.24 |  12.82 |   15.96 |  22.11 |     24.46 |\n",
-      "| 095_Azadirachtin  |     95 |   5.58 |   7.72 |   24.18 |  26.84 |     25.21 |\n",
-      "| 113_Taxol         |    113 |   5.44 |   6.81 |   24.58 |  29.14 |    nan    |\n",
-      "| 168_Valinomycin   |    168 | nan    | nan    |  nan    | nan    |    nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "keys = ['mol', 'natm']\n",
-    "empty = {'mol':[], 'natm':[]}\n",
-    "df_A100_scf = pd.DataFrame(empty)\n",
-    "df_V100_scf = pd.DataFrame(empty)\n",
-    "df_A100_grad = pd.DataFrame(empty)\n",
-    "df_V100_grad = pd.DataFrame(empty)\n",
-    "path = 'organic/xc/'\n",
-    "for xc in ['LDA', 'PBE', 'B3LYP', 'M06', 'wB97m-v']:\n",
-    "    df_qchem = pd.read_csv(path + xc + '/' + qchem_file)\n",
-    "    df_qchem = df_qchem.rename(columns={'t_scf':'scf_qchem', 't_gradient':'grad_qchem'})\n",
-    "    \n",
-    "    df_A100 = pd.read_csv(path + xc + '/' + A100_file)\n",
-    "    df_A100 = df_A100.rename(columns={'t_scf':'scf_A100', 't_gradient':'grad_A100'})\n",
-    "    df_A100 = df_A100.merge(df_qchem, how='outer', on='mol')\n",
-    "    \n",
-    "    df_A100['scf_'+xc] = df_A100['scf_qchem']/df_A100['scf_A100']\n",
-    "    df_A100['grad_'+xc] = df_A100['grad_qchem']/df_A100['grad_A100']\n",
-    "    df_A100 = df_A100[keys+['scf_'+xc, 'grad_'+xc]]\n",
-    "    \n",
-    "    df_A100_scf = df_A100_scf.merge(df_A100[keys+['scf_'+xc]], how='outer', on=keys)\n",
-    "    df_A100_grad= df_A100_grad.merge(df_A100[keys+['grad_'+xc]], how='outer', on=keys)\n",
-    "    df_A100_scf = df_A100_scf.rename(columns={'scf_'+xc:xc})\n",
-    "    df_A100_grad = df_A100_grad.rename(columns={'grad_'+xc:xc})\n",
-    "    df_A100_scf[xc] = df_A100_scf[xc].apply(lambda x: round(x,2))\n",
-    "    df_A100_grad[xc] = df_A100_grad[xc].apply(lambda x: round(x,2))\n",
-    "\n",
-    "    df_V100 = pd.read_csv(path + xc + '/' + V100_file)\n",
-    "    df_V100 = df_V100.rename(columns={'t_scf':'scf_V100', 't_gradient':'grad_V100'})\n",
-    "    df_V100 = df_V100.merge(df_qchem, how='outer', on='mol')\n",
-    "    df_V100['scf_'+xc] = df_V100['scf_qchem']/df_V100['scf_V100']\n",
-    "    df_V100['grad_'+xc] = df_V100['grad_qchem']/df_V100['grad_V100']\n",
-    "\n",
-    "    df_V100_scf = df_V100_scf.merge(df_V100[keys+['scf_'+xc,]], how='outer', on=keys)\n",
-    "    df_V100_grad= df_V100_grad.merge(df_V100[keys+['grad_'+xc]], how='outer', on=keys)\n",
-    "    df_V100_scf = df_V100_scf.rename(columns={'scf_'+xc:xc})\n",
-    "    \n",
-    "    df_V100_grad = df_V100_grad.rename(columns={'grad_'+xc:xc})\n",
-    "    df_V100_scf[xc] = df_V100_scf[xc].apply(lambda x: round(x,2))\n",
-    "    df_V100_grad[xc] = df_V100_grad[xc].apply(lambda x: round(x,2))\n",
-    "\n",
-    "print(df_A100_scf.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 135,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 134,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| mol               |   natm |   LDA |   PBE |   B3LYP |   M06 |   wB97m-v |\n",
-      "|:------------------|-------:|------:|------:|--------:|------:|----------:|\n",
-      "| 020_Vitamin_C     |     20 |  1.89 |  3.3  |    8.18 |  5.95 |     10.58 |\n",
-      "| 031_Inosine       |     31 |  4.64 |  5.95 |    6.41 |  9.48 |     13.15 |\n",
-      "| 033_Bisphenol_A   |     33 |  4.85 |  6.64 |    6.58 | 11.04 |     14.72 |\n",
-      "| 037_Mg_Porphin    |     37 |  5.61 |  8.6  |    9.01 | 12.34 |     16.56 |\n",
-      "| 042_Penicillin_V  |     42 |  4.36 |  6.17 |    7.09 | 10.62 |     14.28 |\n",
-      "| 045_Ochratoxin_A  |     45 |  5.47 |  6.97 |    8.74 | 12.05 |     14.14 |\n",
-      "| 052_Cetirizine    |     52 |  8.43 |  8.51 |    9.16 | 12.44 |     15.37 |\n",
-      "| 057_Tamoxifen     |     57 |  6.79 |  8.41 |    9.98 | 13.44 |     15.67 |\n",
-      "| 066_Raffinose     |     66 |  3.22 |  4.31 |    8.11 | 10.58 |     13.22 |\n",
-      "| 084_Sphingomyelin |     84 |  3.34 |  3.97 |    6.52 |  8.63 |     12.11 |\n",
-      "| 095_Azadirachtin  |     95 |  3.35 |  4.74 |   14.29 | 16.52 |     15.05 |\n",
-      "| 113_Taxol         |    113 |  3.12 |  4.1  |   12.59 | 15.74 |    nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(df_V100_scf.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 137,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 136,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| mol               |   natm |    LDA |    PBE |   B3LYP |    M06 |   wB97m-v |\n",
-      "|:------------------|-------:|-------:|-------:|--------:|-------:|----------:|\n",
-      "| 020_Vitamin_C     |     20 |   5.02 |   7.04 |   10.55 |   9.28 |     11.11 |\n",
-      "| 031_Inosine       |     31 |   7.3  |  10.03 |   15.12 |  12.62 |     13.9  |\n",
-      "| 033_Bisphenol_A   |     33 |   7.58 |  11.1  |   15.55 |  12.64 |     14    |\n",
-      "| 037_Mg_Porphin    |     37 |   7.47 |  11.34 |   18.05 |  15.81 |     14.85 |\n",
-      "| 042_Penicillin_V  |     42 |   6.03 |   8.96 |   17.4  |  14.47 |     13.81 |\n",
-      "| 045_Ochratoxin_A  |     45 |   7.51 |   9.33 |   19.51 |  17.2  |     14.55 |\n",
-      "| 052_Cetirizine    |     52 |   8.32 |   9.7  |   20.8  |  16.46 |     15.7  |\n",
-      "| 057_Tamoxifen     |     57 |   8.91 |   9.61 |   20.61 |  16.2  |     15    |\n",
-      "| 066_Raffinose     |     66 |   8.52 |   9.46 |   24.2  |  18.63 |     17.13 |\n",
-      "| 084_Sphingomyelin |     84 |   8.51 |   9.49 |   23.62 |  21.63 |     17.66 |\n",
-      "| 095_Azadirachtin  |     95 |   7.69 |   9.48 |   42.24 |  34.01 |     23.93 |\n",
-      "| 113_Taxol         |    113 |   8.08 |   9.05 |   51.03 |  40.13 |    nan    |\n",
-      "| 168_Valinomycin   |    168 | nan    | nan    |  nan    | nan    |    nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(df_A100_grad.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 139,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 138,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| mol               |   natm |   LDA |   PBE |   B3LYP |   M06 |   wB97m-v |\n",
-      "|:------------------|-------:|------:|------:|--------:|------:|----------:|\n",
-      "| 020_Vitamin_C     |     20 |  3.19 |  4.28 |    5.9  |  4.82 |      5.84 |\n",
-      "| 031_Inosine       |     31 |  3.21 |  4.5  |    6.55 |  5.52 |      6.39 |\n",
-      "| 033_Bisphenol_A   |     33 |  3.55 |  4.87 |    6.61 |  5.51 |      6.48 |\n",
-      "| 037_Mg_Porphin    |     37 |  3.19 |  5.2  |    8.32 |  7.26 |      6.81 |\n",
-      "| 042_Penicillin_V  |     42 |  3.15 |  4.35 |    8.11 |  7.23 |      6.97 |\n",
-      "| 045_Ochratoxin_A  |     45 |  3.32 |  4.29 |    8.99 |  8.04 |      6.92 |\n",
-      "| 052_Cetirizine    |     52 |  3.51 |  4.6  |    9.41 |  8.18 |      7.57 |\n",
-      "| 057_Tamoxifen     |     57 |  3.86 |  4.66 |    9.56 |  8.4  |      7.51 |\n",
-      "| 066_Raffinose     |     66 |  3.4  |  4.32 |   10.94 |  9.4  |      8.29 |\n",
-      "| 084_Sphingomyelin |     84 |  3.15 |  3.81 |    9.66 |  8.97 |      8.03 |\n",
-      "| 095_Azadirachtin  |     95 |  3.32 |  4.37 |   18.47 | 16.01 |      1.68 |\n",
-      "| 113_Taxol         |    113 |  3.12 |  1.19 |   22.53 | 16.94 |    nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(df_V100_grad.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "python"
-    }
-   },
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Merlin (Python3 + MLSQL) [Spark 3.0]",
-   "language": "python",
-   "name": "merlin_kernel"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "Python3 with MLSQL",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "0.1"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/benchmarks/df/generate_tables.py b/benchmarks/df/generate_tables.py
new file mode 100644
index 00000000..f429db6a
--- /dev/null
+++ b/benchmarks/df/generate_tables.py
@@ -0,0 +1,128 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import pandas as pd
+import numpy as np
+
+# -------------------------------------------
+# |   Density fitting with different basis  |
+# -------------------------------------------
+
+A100_file = 'NVIDIA A100-SXM4-80GB.csv'
+V100_file = 'Tesla V100-SXM2-32GB.csv'
+qchem_file = 'qchem-32-cores-cpu.csv'
+
+keys = ['mol', 'natm']
+empty = {'mol':[], 'natm':[]}
+df_A100_scf = pd.DataFrame(empty)
+df_V100_scf = pd.DataFrame(empty)
+df_A100_grad = pd.DataFrame(empty)
+df_V100_grad = pd.DataFrame(empty)
+path = 'organic/basis/'
+
+for basis in ['sto-3g', '6-31g', 'def2-svp', 'def2-tzvpp', 'def2-tzvpd']:
+    df_qchem = pd.read_csv(path + basis + '/' + qchem_file)
+    df_qchem = df_qchem.rename(columns={'t_scf':'scf_qchem', 't_gradient':'grad_qchem'})
+
+    df_A100 = pd.read_csv(path + basis + '/' + A100_file)
+    df_A100 = df_A100.rename(columns={'t_scf':'scf_A100', 't_gradient':'grad_A100'})
+    df_A100 = df_A100.merge(df_qchem, how='outer', on='mol')
+
+    df_A100['scf_'+basis] = df_A100['scf_qchem']/df_A100['scf_A100']
+    df_A100['grad_'+basis] = df_A100['grad_qchem']/df_A100['grad_A100']
+    df_A100 = df_A100[keys+['scf_'+basis, 'grad_'+basis]]
+
+    df_A100_scf = df_A100_scf.merge(df_A100[keys+['scf_'+basis]], how='outer', on=keys)
+    df_A100_grad= df_A100_grad.merge(df_A100[keys+['grad_'+basis]], how='outer', on=keys)
+    df_A100_scf = df_A100_scf.rename(columns={'scf_'+basis:basis})
+    df_A100_grad = df_A100_grad.rename(columns={'grad_'+basis:basis})
+    df_A100_scf[basis] = df_A100_scf[basis].apply(lambda x: round(x,2))
+    df_A100_grad[basis] = df_A100_grad[basis].apply(lambda x: round(x,2))
+
+    df_V100 = pd.read_csv(path + basis + '/' + V100_file)
+    df_V100 = df_V100.rename(columns={'t_scf':'scf_V100', 't_gradient':'grad_V100'})
+    df_V100 = df_V100.merge(df_qchem, how='outer', on='mol')
+    df_V100['scf_'+basis] = df_V100['scf_qchem']/df_V100['scf_V100']
+    df_V100['grad_'+basis] = df_V100['grad_qchem']/df_V100['grad_V100']
+
+    df_V100_scf = df_V100_scf.merge(df_V100[keys+['scf_'+basis,]], how='outer', on=keys)
+    df_V100_grad= df_V100_grad.merge(df_V100[keys+['grad_'+basis]], how='outer', on=keys)
+    df_V100_scf = df_V100_scf.rename(columns={'scf_'+basis:basis})
+
+    df_V100_grad = df_V100_grad.rename(columns={'grad_'+basis:basis})
+    df_V100_scf[basis] = df_V100_scf[basis].apply(lambda x: round(x,2))
+    df_V100_grad[basis] = df_V100_grad[basis].apply(lambda x: round(x,2))
+
+print("\n============SCF speedup with A100-80G============\n")
+print(df_A100_scf.to_markdown(index=False))
+print("\n============SCF speedup with V100-32G============\n")
+print(df_V100_scf.to_markdown(index=False))
+print("\n============Gradient speedup with A100-80G=======\n")
+print(df_A100_grad.to_markdown(index=False))
+print("\n============Gradient speedup with V100-32G=======\n")
+print(df_V100_grad.to_markdown(index=False))
+
+# -----------------------------------------
+# |   Density fitting with different xc   |
+# -----------------------------------------
+
+keys = ['mol', 'natm']
+empty = {'mol':[], 'natm':[]}
+df_A100_scf = pd.DataFrame(empty)
+df_V100_scf = pd.DataFrame(empty)
+df_A100_grad = pd.DataFrame(empty)
+df_V100_grad = pd.DataFrame(empty)
+path = 'organic/xc/'
+for xc in ['LDA', 'PBE', 'B3LYP', 'M06', 'wB97m-v']:
+    df_qchem = pd.read_csv(path + xc + '/' + qchem_file)
+    df_qchem = df_qchem.rename(columns={'t_scf':'scf_qchem', 't_gradient':'grad_qchem'})
+
+    df_A100 = pd.read_csv(path + xc + '/' + A100_file)
+    df_A100 = df_A100.rename(columns={'t_scf':'scf_A100', 't_gradient':'grad_A100'})
+    df_A100 = df_A100.merge(df_qchem, how='outer', on='mol')
+
+    df_A100['scf_'+xc] = df_A100['scf_qchem']/df_A100['scf_A100']
+    df_A100['grad_'+xc] = df_A100['grad_qchem']/df_A100['grad_A100']
+    df_A100 = df_A100[keys+['scf_'+xc, 'grad_'+xc]]
+
+    df_A100_scf = df_A100_scf.merge(df_A100[keys+['scf_'+xc]], how='outer', on=keys)
+    df_A100_grad= df_A100_grad.merge(df_A100[keys+['grad_'+xc]], how='outer', on=keys)
+    df_A100_scf = df_A100_scf.rename(columns={'scf_'+xc:xc})
+    df_A100_grad = df_A100_grad.rename(columns={'grad_'+xc:xc})
+    df_A100_scf[xc] = df_A100_scf[xc].apply(lambda x: round(x,2))
+    df_A100_grad[xc] = df_A100_grad[xc].apply(lambda x: round(x,2))
+
+    df_V100 = pd.read_csv(path + xc + '/' + V100_file)
+    df_V100 = df_V100.rename(columns={'t_scf':'scf_V100', 't_gradient':'grad_V100'})
+    df_V100 = df_V100.merge(df_qchem, how='outer', on='mol')
+    df_V100['scf_'+xc] = df_V100['scf_qchem']/df_V100['scf_V100']
+    df_V100['grad_'+xc] = df_V100['grad_qchem']/df_V100['grad_V100']
+
+    df_V100_scf = df_V100_scf.merge(df_V100[keys+['scf_'+xc,]], how='outer', on=keys)
+    df_V100_grad= df_V100_grad.merge(df_V100[keys+['grad_'+xc]], how='outer', on=keys)
+    df_V100_scf = df_V100_scf.rename(columns={'scf_'+xc:xc})
+
+    df_V100_grad = df_V100_grad.rename(columns={'grad_'+xc:xc})
+    df_V100_scf[xc] = df_V100_scf[xc].apply(lambda x: round(x,2))
+    df_V100_grad[xc] = df_V100_grad[xc].apply(lambda x: round(x,2))
+
+print("\n============SCF speedup with A100-80G============\n")
+print(df_A100_scf.to_markdown(index=False))
+print("\n============SCF speedup with V100-32G============\n")
+print(df_V100_scf.to_markdown(index=False))
+print("\n============Gradient speedup with A100-80G=======\n")
+print(df_A100_grad.to_markdown(index=False))
+print("\n============Gradient speedup with V100-32G=======\n")
+print(df_V100_grad.to_markdown(index=False))
diff --git a/benchmarks/df/qchem.py b/benchmarks/df/qchem.py
index 236e89d3..61e32cf0 100644
--- a/benchmarks/df/qchem.py
+++ b/benchmarks/df/qchem.py
@@ -1,3 +1,18 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
 import os
 import csv
 import argparse
diff --git a/benchmarks/df/qchem_input.in b/benchmarks/df/qchem_input.in
deleted file mode 100644
index 9f357459..00000000
--- a/benchmarks/df/qchem_input.in
+++ /dev/null
@@ -1,86 +0,0 @@
-$molecule
-0 1
-C                 -1.42958937   -1.01658249    2.44831265
-C                 -1.54149463   -0.27943595    1.12935913
-C                 -2.95243912    1.47129501    2.22492550
-C                 -2.84134324    0.73400280    3.54388810
-C                 -1.55797242   -0.06742235    3.62210684
-H                 -0.66497820    0.41085733    1.01283439
-H                 -0.44314689   -1.54644194    2.49994231
-H                 -3.93813646    2.00251835    2.17350554
-H                 -3.71865156    0.04459988    3.66018043
-H                 -0.68133448    0.63238265    3.63327796
-O                 -1.92499204    2.46279929    2.14636482
-C                 -1.46076301    4.67520648    3.08258818
-C                 -1.65192906    4.60593616    4.58432062
-H                 -0.74117334    5.43388332    2.85565168
-C                 -3.37658473    6.27667868    2.91318631
-C                 -2.26541279    5.88367074    5.11970508
-H                 -2.31747310    3.73866848    4.83580579
-C                 -3.56636040    6.20921206    4.41472333
-H                 -4.36600055    6.45426127    2.41719619
-H                 -1.54350069    6.73075622    4.97913735
-H                 -4.32958469    5.42429404    4.65913754
-O                 -2.52952388    7.38269576    2.59051909
-C                 -3.31900286    8.44866624    2.05635630
-C                 -3.26458455    9.66301263    2.97202724
-C                 -2.80523815   10.85434814    2.14390783
-H                 -4.26911658    9.86120175    3.42267320
-C                 -2.45049436   10.34868118    0.75291070
-H                 -1.92149828   11.34764146    2.62047963
-H                 -3.01626404   10.91739387   -0.02697788
-C                 -4.76356005    7.92754855    1.94107026
-H                 -4.78266313    7.06714393    1.30527236
-H                 -5.12531018    7.66145994    2.91227245
-O                 -2.82580898    0.52071700    1.05150767
-C                 -0.93849427    3.31732760    2.57761263
-H                 -0.77727115    3.36776157    1.52103158
-H                 -0.01653199    3.08491476    3.06839754
-C                 -1.49224313   -1.28985314   -0.03177454
-H                 -0.58145107   -1.84899344    0.02037852
-H                 -2.32526415   -1.95746693    0.04080516
-C                 -0.95163774   10.56175833    0.47072820
-H                 -0.66603955    9.99216789   -0.38886496
-H                 -0.38034325   10.24135666    1.31681648
-O                 -1.55044662   -0.58860344   -1.27666860
-H                 -1.40118565   -1.20385620   -1.99832308
-O                 -2.45542951   -2.00981905    2.52598600
-H                 -2.07814582   -2.83668451    2.83509848
-O                 -1.53066740   -0.81010098    4.84381994
-H                 -0.61998842   -0.98424670    5.09269150
-O                 -2.88646987    1.67226139    4.62209700
-H                 -3.42291955    1.31644411    5.33428785
-O                 -0.39050984    4.37790095    5.21813565
-H                 -0.43907755    4.65272412    6.13667438
-O                 -2.50206790    5.74947404    6.52358741
-H                 -2.24542728    6.55931660    6.97067871
-O                 -4.06906224    7.46048270    4.89065678
-H                 -4.76357327    7.30283533    5.53439721
-O                 -2.76235824    4.99928466    2.37789525
-O                 -5.59594331    8.94915410    1.38576835
-H                 -6.48998576    8.61436213    1.28476127
-O                 -2.79571817    8.86766013    0.68976834
-O                 -2.35757225    9.42976003    4.05268533
-H                 -2.22828280   10.24429373    4.54402949
-O                 -3.84292711   11.83519815    2.06622328
-H                 -3.45759742   12.71343457    2.10890596
-O                 -0.70503591   11.94904429    0.22675035
-H                  0.04258943   12.04182451   -0.36826913
-
-$end
-$rem
-JOBTYPE force
-METHOD LDA
-BASIS def2-tzvpp
-SYMMETRY      FALSE
-SYM_IGNORE    TRUE
-XC_GRID       000099000590
-NL_GRID       000050000194
-MAX_SCF_CYCLES 100
-ri_j        True
-ri_k        True
-aux_basis     RIJK-def2-tzvp
-SCF_CONVERGENCE 9
-THRESH        14
-BASIS_LIN_DEP_THRESH 12
-$end
diff --git a/benchmarks/scf/dft_driver.py b/benchmarks/scf/dft_driver.py
index 5cc05a66..9021bb07 100644
--- a/benchmarks/scf/dft_driver.py
+++ b/benchmarks/scf/dft_driver.py
@@ -1,3 +1,18 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
 import os
 import csv
 import pyscf
diff --git a/benchmarks/scf/generate_tables.ipynb b/benchmarks/scf/generate_tables.ipynb
deleted file mode 100644
index c3eb0dd2..00000000
--- a/benchmarks/scf/generate_tables.ipynb
+++ /dev/null
@@ -1,201 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Direct SCF with different xc"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "|   mol |   natm |    LDA |    PBE |   B3LYP |    M06 |   wB97m-v |\n",
-      "|------:|-------:|-------:|-------:|--------:|-------:|----------:|\n",
-      "|     2 |      3 |   0.22 |   0.32 |    0.27 |   0.25 |      0.69 |\n",
-      "|     3 |     15 |   0.68 |   0.25 |    1.58 |   2.61 |      4.84 |\n",
-      "|     4 |     30 |   1.59 |   2.63 |    4.09 |   6.93 |      8.17 |\n",
-      "|     5 |     60 |   2.86 |   3.64 |    7.15 |   8.44 |      9.44 |\n",
-      "|     6 |     96 |   4.34 |   4.39 |    7.75 |  10.58 |      9.87 |\n",
-      "|     7 |    141 |   4.07 |   4.1  |    8.87 |  10.47 |     10.13 |\n",
-      "|     8 |    228 |   4.34 |   4.58 |    9.39 |  10.48 |      9.36 |\n",
-      "|     9 |    300 |   5.05 |   5.21 |    9.35 |  11.36 |    nan    |\n",
-      "|    10 |    417 |   4.91 | nan    |  nan    | nan    |    nan    |\n",
-      "|    10 |    nan | nan    | nan    |  nan    | nan    |    nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "A100_file = 'A100-SXM-80GB.csv'\n",
-    "qchem_file = 'qchem-32-cores-cpu.csv'\n",
-    "\n",
-    "keys = ['mol', 'natm']\n",
-    "empty = {'mol':[], 'natm':[]}\n",
-    "df_A100_scf = pd.DataFrame(empty)\n",
-    "df_V100_scf = pd.DataFrame(empty)\n",
-    "df_A100_grad = pd.DataFrame(empty)\n",
-    "df_V100_grad = pd.DataFrame(empty)\n",
-    "path = 'water_clusters/xc/'\n",
-    "for xc in ['LDA', 'PBE', 'B3LYP', 'M06', 'wB97m-v']:\n",
-    "    df_qchem = pd.read_csv(path + xc + '/' + qchem_file)\n",
-    "    df_qchem = df_qchem.rename(columns={'t_scf':'scf_qchem', 't_gradient':'grad_qchem'})\n",
-    "    \n",
-    "    df_A100 = pd.read_csv(path + xc + '/' + A100_file)\n",
-    "    df_A100 = df_A100.rename(columns={'t_scf':'scf_A100', 't_gradient':'grad_A100'})\n",
-    "    df_A100 = df_A100.merge(df_qchem, how='outer', on='mol')\n",
-    "    \n",
-    "    df_A100['scf_'+xc] = df_A100['scf_qchem']/df_A100['scf_A100']\n",
-    "    df_A100['grad_'+xc] = df_A100['grad_qchem']/df_A100['grad_A100']\n",
-    "    df_A100 = df_A100[keys+['scf_'+xc, 'grad_'+xc]]\n",
-    "    \n",
-    "    df_A100_scf = df_A100_scf.merge(df_A100[keys+['scf_'+xc]], how='outer', on=keys)\n",
-    "    df_A100_grad= df_A100_grad.merge(df_A100[keys+['grad_'+xc]], how='outer', on=keys)\n",
-    "    df_A100_scf = df_A100_scf.rename(columns={'scf_'+xc:xc})\n",
-    "    df_A100_grad = df_A100_grad.rename(columns={'grad_'+xc:xc})\n",
-    "    df_A100_scf[xc] = df_A100_scf[xc].apply(lambda x: round(x,2))\n",
-    "    df_A100_grad[xc] = df_A100_grad[xc].apply(lambda x: round(x,2))\n",
-    "\n",
-    "print(df_A100_scf.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "''"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "|   mol |   natm |    LDA |    PBE |   B3LYP |    M06 |   wB97m-v |\n",
-      "|------:|-------:|-------:|-------:|--------:|-------:|----------:|\n",
-      "|     2 |      3 |   0.82 |   0.89 |    0.75 |   0.82 |      0.6  |\n",
-      "|     3 |     15 |   0.39 |   0.19 |    1.46 |   1.52 |      1.47 |\n",
-      "|     4 |     30 |   0.56 |   1.04 |    2.07 |   2.25 |      1.89 |\n",
-      "|     5 |     60 |   0.54 |   0.87 |    2.42 |   2.4  |      1.77 |\n",
-      "|     6 |     96 |   0.6  |   0.87 |    2.36 |   2.51 |      1.53 |\n",
-      "|     7 |    141 |   0.93 |   1.1  |    2.61 |   2.59 |      1.55 |\n",
-      "|     8 |    228 |   1.92 |   1.9  |    3.37 |   3.39 |      1.83 |\n",
-      "|     9 |    300 |   2.26 |   2.02 |    3.06 |   3.59 |    nan    |\n",
-      "|    10 |    417 |   2.46 | nan    |  nan    | nan    |    nan    |\n",
-      "|    10 |    nan | nan    | nan    |  nan    | nan    |    nan    |\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(df_A100_grad.to_markdown(index=False))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Merlin (Python3 + MLSQL) [Spark 3.0]",
-   "language": "python",
-   "name": "merlin_kernel"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.2"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/benchmarks/scf/generate_tables.py b/benchmarks/scf/generate_tables.py
new file mode 100644
index 00000000..4bfc563b
--- /dev/null
+++ b/benchmarks/scf/generate_tables.py
@@ -0,0 +1,55 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import pandas as pd
+import numpy as np
+
+# -------------------------------------------
+# |   Density fitting with different xc     |
+# -------------------------------------------
+
+A100_file = 'A100-SXM-80GB.csv'
+qchem_file = 'qchem-32-cores-cpu.csv'
+
+keys = ['mol', 'natm']
+empty = {'mol':[], 'natm':[]}
+df_A100_scf = pd.DataFrame(empty)
+df_V100_scf = pd.DataFrame(empty)
+df_A100_grad = pd.DataFrame(empty)
+df_V100_grad = pd.DataFrame(empty)
+path = 'water_clusters/xc/'
+for xc in ['LDA', 'PBE', 'B3LYP', 'M06', 'wB97m-v']:
+    df_qchem = pd.read_csv(path + xc + '/' + qchem_file)
+    df_qchem = df_qchem.rename(columns={'t_scf':'scf_qchem', 't_gradient':'grad_qchem'})
+
+    df_A100 = pd.read_csv(path + xc + '/' + A100_file)
+    df_A100 = df_A100.rename(columns={'t_scf':'scf_A100', 't_gradient':'grad_A100'})
+    df_A100 = df_A100.merge(df_qchem, how='outer', on='mol')
+
+    df_A100['scf_'+xc] = df_A100['scf_qchem']/df_A100['scf_A100']
+    df_A100['grad_'+xc] = df_A100['grad_qchem']/df_A100['grad_A100']
+    df_A100 = df_A100[keys+['scf_'+xc, 'grad_'+xc]]
+
+    df_A100_scf = df_A100_scf.merge(df_A100[keys+['scf_'+xc]], how='outer', on=keys)
+    df_A100_grad= df_A100_grad.merge(df_A100[keys+['grad_'+xc]], how='outer', on=keys)
+    df_A100_scf = df_A100_scf.rename(columns={'scf_'+xc:xc})
+    df_A100_grad = df_A100_grad.rename(columns={'grad_'+xc:xc})
+    df_A100_scf[xc] = df_A100_scf[xc].apply(lambda x: round(x,2))
+    df_A100_grad[xc] = df_A100_grad[xc].apply(lambda x: round(x,2))
+
+print("\n============SCF speedup with A100-80G============\n")
+print(df_A100_scf.to_markdown(index=False))
+print("\n============Gradient speedup with A100-80G=======\n")
+print(df_A100_grad.to_markdown(index=False))
\ No newline at end of file
diff --git a/benchmarks/scf/qchem.py b/benchmarks/scf/qchem.py
index fe5eeb39..8e49b764 100644
--- a/benchmarks/scf/qchem.py
+++ b/benchmarks/scf/qchem.py
@@ -1,3 +1,18 @@
+# Copyright 2023 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
 import os
 import csv
 import argparse
@@ -45,15 +60,15 @@ def run_dft(filename):
         input.write("INCDFT_GRIDDIFF_THRESH   14\n")
         input.write("BASIS_LIN_DEP_THRESH 12\n")
         input.write("$end\n")
-    
-    import tempfile 
-    temp = tempfile.NamedTemporaryFile() 
+
+    import tempfile
+    temp = tempfile.NamedTemporaryFile()
     filename = temp.name
 
-    import os 
+    import os
     print(f'creating a temp file named {filename}')
     os.system('qchem -nt 32 qchem_input.in > ' + filename)
-    
+
     with open(filename, 'r') as output_file:
         lines = output_file.readlines()
         for line in lines:
diff --git a/examples/00-h2o.py b/examples/00-h2o.py
index 7df44258..5ac37b6f 100644
--- a/examples/00-h2o.py
+++ b/examples/00-h2o.py
@@ -32,9 +32,9 @@
 screen_tol = 1e-14
 grids_level = 3
 
-mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
+mol = pyscf.M(atom=atom, basis=bas, max_memory=32000, output='./pyscf.log')
 
-mol.verbose = 6
+mol.verbose = 4
 mf_GPU = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
 mf_GPU.grids.level = grids_level
 mf_GPU.conv_tol = scf_tol
diff --git a/examples/03-h2o_dzvp.py b/examples/03-h2o_dzvp.py
index e95f167c..044d65b5 100644
--- a/examples/03-h2o_dzvp.py
+++ b/examples/03-h2o_dzvp.py
@@ -19,7 +19,7 @@
 from gpu4pyscf.dft import rks
 lib.num_threads(8)
 
-atom =''' 
+atom ='''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
 H       0.7570000000     0.0000000000    -0.4696000000
@@ -35,7 +35,7 @@
 mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
 
 auxbasis = {}
-from pyscf import df, gto
+from pyscf import df
 etb_basis = df.aug_etb(mol, beta=2.0)
 for i in range(mol.natm):
     sym = mol.atom_pure_symbol(i)
@@ -57,7 +57,7 @@
 
 # Compute Energy
 print('------------------- Energy -----------------------------')
-e_dft = mf_GPU.kernel() 
+e_dft = mf_GPU.kernel()
 print('DFT energy by GPU4PySCF')
 print(e_dft)
 
diff --git a/examples/09-mbis.py b/examples/09-mbis.py
index 73aaf637..95512d50 100644
--- a/examples/09-mbis.py
+++ b/examples/09-mbis.py
@@ -19,7 +19,7 @@
 from gpu4pyscf.dft import rks
 lib.num_threads(8)
 
-atom =''' 
+atom ='''
 O       0.0000000000    -0.0000000000     0.1174000000
 H      -0.7570000000    -0.0000000000    -0.4696000000
 H       0.7570000000     0.0000000000    -0.4696000000
@@ -60,5 +60,4 @@
 atnums = np.array(atom_list)
 atcoords = mol.atom_coords(unit='B')
 
-np.savez('density.npz', points=points, weights=weights, \
-    density=density, atnums=atnums, atcoords=atcoords)
\ No newline at end of file
+np.savez('density.npz', points=points, weights=weights, density=density, atnums=atnums, atcoords=atcoords)
\ No newline at end of file
diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
index 4d81bc03..0a478cc2 100644
--- a/gpu4pyscf/__config__.py
+++ b/gpu4pyscf/__config__.py
@@ -20,9 +20,9 @@
     number_of_threads = 1024 * 80
 # such as A30-24GB
 elif props['totalGlobalMem'] >= 16 * GB:
-    min_ao_blksize = 64
-    min_grid_blksize = 64*64
-    ao_aligned = 16
+    min_ao_blksize = 128
+    min_grid_blksize = 128*128
+    ao_aligned = 32
     grid_aligned = 128
     mem_fraction = 0.9
     number_of_threads = 1024 * 80
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
index 550ac39c..7ed8fa1c 100644
--- a/gpu4pyscf/__init__.py
+++ b/gpu4pyscf/__init__.py
@@ -1,2 +1,6 @@
 from . import lib, grad, hessian, solvent, scf, dft
-__version__ = '0.6.6'
+__version__ = '0.6.9'
+
+# monkey patch libxc reference due to a bug in nvcc
+from pyscf.dft import libxc
+libxc.__reference__ = 'unable to decode the reference due to https://github.com/NVIDIA/cuda-python/issues/29'
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
index da59f9da..76a14f28 100644
--- a/gpu4pyscf/df/df.py
+++ b/gpu4pyscf/df/df.py
@@ -22,7 +22,7 @@
 from pyscf import lib
 from pyscf.df import df, addons
 from gpu4pyscf.lib.cupy_helper import (
-    cholesky, tag_array, get_avail_mem, cart2sph)
+    cholesky, tag_array, get_avail_mem, cart2sph, take_last2d)
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -77,13 +77,13 @@ def build(self, direct_scf_tol=1e-14, omega=None):
                 j2c_cpu = auxmol.intor('int2c2e', hermi=1)
         else:
             j2c_cpu = auxmol.intor('int2c2e', hermi=1)
-        j2c = cupy.asarray(j2c_cpu)
+        j2c = cupy.asarray(j2c_cpu, order='C')
         t0 = log.timer_debug1('2c2e', *t0)
         intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
         intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=256)
         log.timer_debug1('prepare intopt', *t0)
         self.j2c = j2c.copy()
-        j2c = j2c[cupy.ix_(intopt.sph_aux_idx, intopt.sph_aux_idx)]
+        j2c = take_last2d(j2c, intopt.sph_aux_idx)
         try:
             self.cd_low = cholesky(j2c)
             self.cd_low = tag_array(self.cd_low, tag='cd')
diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
index febdee40..69231337 100644
--- a/gpu4pyscf/df/grad/rhf.py
+++ b/gpu4pyscf/df/grad/rhf.py
@@ -20,7 +20,7 @@
 from pyscf.df.grad import rhf
 from pyscf import lib, scf, gto
 from gpu4pyscf.df import int3c2e
-from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library
+from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library, take_last2d
 from gpu4pyscf.grad.rhf import grad_elec
 from gpu4pyscf import __config__
 from gpu4pyscf.lib import logger
@@ -59,7 +59,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
     mo_coeff = cupy.asarray(mf_grad.base.mo_coeff)
     mo_occ = cupy.asarray(mf_grad.base.mo_occ)
     sph_ao_idx = intopt.sph_ao_idx
-    dm = dm0[numpy.ix_(sph_ao_idx, sph_ao_idx)]
+    dm = take_last2d(dm0, sph_ao_idx)
     orbo = contract('pi,i->pi', mo_coeff[:,mo_occ>0], numpy.sqrt(mo_occ[mo_occ>0]))
     orbo = orbo[sph_ao_idx, :]
     nocc = orbo.shape[-1]
@@ -119,7 +119,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
             #rhok = solve_triangular(low_t, rhok, lower=False)
             rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc)
         tmp = contract('pij,qij->pq', rhok, rhok)
-        tmp = tmp[cupy.ix_(rev_aux_idx, rev_aux_idx)]
+        tmp = take_last2d(tmp, rev_aux_idx)
         vkaux = -contract('xpq,pq->xp', int2c_e1, tmp)
         vkaux_2c = cupy.array([-vkaux[:,p0:p1].sum(axis=1) for p0, p1 in auxslices[:,2:]])
         vkaux = tmp = None
diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
index 1141de1f..710963a9 100644
--- a/gpu4pyscf/df/hessian/rhf.py
+++ b/gpu4pyscf/df/hessian/rhf.py
@@ -38,7 +38,7 @@
 import numpy as np
 from pyscf import lib, df
 from gpu4pyscf.hessian import rhf as rhf_hess
-from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info
+from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info, take_last2d
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib import logger
 
@@ -65,7 +65,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     if mo_coeff is None:  mo_coeff = mf.mo_coeff
     if atmlst is None: atmlst = range(mol.natm)
 
-    mo_coeff = cupy.asarray(mo_coeff)
+    mo_coeff = cupy.asarray(mo_coeff, order='C')
     nao, nmo = mo_coeff.shape
     mocc = mo_coeff[:,mo_occ>0]
     mocc_2 = cupy.einsum('pi,i->pi', mocc, mo_occ[mo_occ>0]**.5)
@@ -77,8 +77,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #      overlap matrix contributions
     # ------------------------------------
     s1aa, s1ab, _ = rhf_hess.get_ovlp(mol)
-    s1aa = cupy.asarray(s1aa)
-    s1ab = cupy.asarray(s1ab)
+    s1aa = cupy.asarray(s1aa, order='C')
+    s1ab = cupy.asarray(s1ab, order='C')
 
     auxmol = df.addons.make_auxmol(mol, auxbasis=mf.with_df.auxbasis)
     naux = auxmol.nao
@@ -101,15 +101,15 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     sph_aux_idx = intopt.sph_aux_idx
 
     mocc_2 = mocc_2[sph_ao_idx, :]
-    dm0 = dm0[cupy.ix_(sph_ao_idx, sph_ao_idx)]
+    dm0 = take_last2d(dm0, sph_ao_idx)
     dm0_tag = tag_array(dm0, occ_coeff=mocc_2)
 
-    int2c = cupy.asarray(int2c)
-    int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)]
+    int2c = cupy.asarray(int2c, order='C')
+    int2c = take_last2d(int2c, sph_aux_idx)
     int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12)
 
-    int2c_ip1 = cupy.asarray(int2c_ip1)
-    int2c_ip1 = int2c_ip1[cupy.ix_(np.arange(3), sph_aux_idx, sph_aux_idx)]
+    int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
+    int2c_ip1 = take_last2d(int2c_ip1, sph_aux_idx)
     int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv)
 
     hj_ao_ao = cupy.zeros([nao,nao,3,3])
@@ -188,46 +188,48 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     cupy.get_default_memory_pool().free_all_blocks()
     #  int3c_ipip1 contributions
-    hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_ipip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega)
+    hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_ipip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
     hj_ao_diag *= 2.0
     t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1)
 
     #  int3c_ipvip1 contributions
     # (11|0), (0|00) without response of RI basis
-    hj, hk = int3c2e.get_int3c2e_ipvip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega)
+    hj, hk = int3c2e.get_int3c2e_ipvip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
     hj_ao_ao += 2.0*hj
-    hk_ao_ao += hk
+    if with_k:
+        hk_ao_ao += hk
     hj = hk = None
     t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1)
 
     #  int3c_ip1ip2 contributions
     # (10|1), (0|0)(0|00)
     if hessobj.auxbasis_response:
-        hj, hk = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega)
+        hj, hk = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
         hj_ao_aux += hj
-        hk_ao_aux += hk
+        if with_k:
+            hk_ao_aux += hk
         hj = hk = None
         t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1)
 
     #  int3c_ipip2 contributions
     if hessobj.auxbasis_response > 1:
         # (00|2), (0|0)(0|00)
-        hj, hk = int3c2e.get_int3c2e_ipip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega)
+        hj, hk = int3c2e.get_int3c2e_ipip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
         hj_aux_diag = hj
-        hk_aux_diag = .5*hk
+        if with_k:
+            hk_aux_diag = .5*hk
         hj = hk = None
         t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1)
 
     # int2c contributions
     if hessobj.auxbasis_response > 1:
-        aux_aux_9 = np.ix_(np.arange(9), sph_aux_idx, sph_aux_idx)
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
                 int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         else:
             int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
-        int2c_ipip1 = cupy.asarray(int2c_ipip1)
-        int2c_ipip1 = int2c_ipip1[aux_aux_9]
+        int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C')
+        int2c_ipip1 = take_last2d(int2c_ipip1, sph_aux_idx)
         rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P)
         # (00|0)(2|0)(0|00)
         hj_aux_diag -= cupy.einsum('p,xp->px', rhoj0_P, rhoj2c_P).reshape(-1,3,3)
@@ -241,13 +243,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                 int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
         else:
             int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1')
-        int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2)
-        int2c_ip1ip2 = int2c_ip1ip2[aux_aux_9]
+        int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C')
+        int2c_ip1ip2 = take_last2d(int2c_ip1ip2, sph_aux_idx)
         hj_aux_aux = -.5 * cupy.einsum('p,xpq,q->pqx', rhoj0_P, int2c_ip1ip2, rhoj0_P).reshape(naux, naux,3,3)
         if with_k:
             hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3)
         t1 = log.timer_debug1('intermediate variables with int2c_*', *t1)
-        int2c_ip1ip2 = aux_aux_9 = None
+        int2c_ip1ip2 = None
 
     cupy.get_default_memory_pool().free_all_blocks()
     release_gpu_stack()
@@ -399,8 +401,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         atmlst = range(mol.natm)
     # FIXME
     with_k = True
-    mo_coeff = cupy.asarray(mo_coeff)
-    mo_occ = cupy.asarray(mo_occ)
+    mo_coeff = cupy.asarray(mo_coeff, order='C')
+    mo_occ = cupy.asarray(mo_occ, order='C')
 
     mf = hessobj.base
     #auxmol = hessobj.base.with_df.auxmol
@@ -417,7 +419,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             int2c = auxmol.intor('int2c2e', aosym='s1')
     else:
         int2c = auxmol.intor('int2c2e', aosym='s1')
-    int2c = cupy.asarray(int2c)
+    int2c = cupy.asarray(int2c, order='C')
     # ======================= sorted AO begin ======================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
     intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=BLKSIZE, group_size=BLKSIZE)
@@ -427,10 +429,10 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     mocc = mocc[sph_ao_idx, :]
     mo_coeff = mo_coeff[sph_ao_idx,:]
-    dm0 = dm0[cupy.ix_(sph_ao_idx, sph_ao_idx)]
+    dm0 = take_last2d(dm0, sph_ao_idx)
     dm0_tag = tag_array(dm0, occ_coeff=mocc)
 
-    int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)]
+    int2c = take_last2d(int2c, sph_aux_idx)
     int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12)
 
     wj, wk_Pl_, wk_P__ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega)
@@ -448,8 +450,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
 
     # int3c_ip1 contributions
     vj1_buf, vk1_buf, vj1_ao, vk1_ao = int3c2e.get_int3c2e_ip1_vjk(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega)
-    vj1_buf = vj1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)]
-    vk1_buf = vk1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)]
+    vj1_buf = take_last2d(vj1_buf, rev_ao_idx)
+    vk1_buf = take_last2d(vk1_buf, rev_ao_idx)
 
     vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff)
     vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff)
@@ -469,8 +471,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
                 int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
         else:
             int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
-        int2c_ip1 = cupy.asarray(int2c_ip1)
-        int2c_ip1 = int2c_ip1[cupy.ix_(np.arange(3), sph_aux_idx, sph_aux_idx)]
+        int2c_ip1 = cupy.asarray(int2c_ip1, order='C')
+        int2c_ip1 = take_last2d(int2c_ip1, sph_aux_idx)
 
         wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0)
         wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__)
@@ -524,7 +526,7 @@ def _ao2mo(mat):
             vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1)
 
         h1 = hcore_deriv(ia)
-        h1 = _ao2mo(cupy.asarray(h1))
+        h1 = _ao2mo(cupy.asarray(h1, order='C'))
         vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao)
         if with_k:
             vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao)
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
index f3a43442..dda6cd50 100644
--- a/gpu4pyscf/df/int3c2e.py
+++ b/gpu4pyscf/df/int3c2e.py
@@ -22,7 +22,7 @@
 from pyscf.scf import _vhf
 from gpu4pyscf.scf.hf import BasisProdCache, _make_s_index_offsets
 from gpu4pyscf.lib.cupy_helper import (
-    block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l, get_avail_mem, print_mem_info)
+    block_c2s_diag, cart2sph, block_diag, contract, load_library, c2s_l, get_avail_mem, print_mem_info, take_last2d)
 from gpu4pyscf.lib import logger
 
 LMAX_ON_GPU = 8
@@ -873,15 +873,19 @@ def get_int3c2e_ipip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
     nao_sph = dm0_tag.shape[0]
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     hj = cupy.zeros([nao_sph,9])
-    hk = cupy.zeros([nao_sph,9])
+    hk = None
+    if with_k:
+        hk = cupy.zeros([nao_sph,9])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1', omega=omega):
-        rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-        rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
         tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
         hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1])
-        hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp)
+        if with_k:
+            rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
+            rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
+            hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp)
     hj = hj.reshape([nao_sph,3,3])
-    hk = hk.reshape([nao_sph,3,3])
+    if with_k:
+        hk = hk.reshape([nao_sph,3,3])
     return hj, hk
 
 def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
@@ -891,15 +895,19 @@ def get_int3c2e_ipvip1_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None)
     nao_sph = dm0_tag.shape[0]
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     hj = cupy.zeros([nao_sph,nao_sph,9])
-    hk = cupy.zeros([nao_sph,nao_sph,9])
+    hk = None
+    if with_k:
+        hk = cupy.zeros([nao_sph,nao_sph,9])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1', omega=omega):
-        rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-        rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1])
         tmp = contract('xpji,ij->xpij', int3c_blk, dm0_tag[i0:i1,j0:j1])
         hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1])
-        hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp)
+        if with_k:
+            rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
+            rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1])
+            hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp)
     hj = hj.reshape([nao_sph,nao_sph,3,3])
-    hk = hk.reshape([nao_sph,nao_sph,3,3])
+    if with_k:
+        hk = hk.reshape([nao_sph,nao_sph,3,3])
     return hj, hk
 
 def get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
@@ -910,15 +918,19 @@ def get_int3c2e_ip1ip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None)
     naux_sph = rhok.shape[0]
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     hj = cupy.zeros([nao_sph,naux_sph,9])
-    hk = cupy.zeros([nao_sph,naux_sph,9])
+    hk = None
+    if with_k:
+        hk = cupy.zeros([nao_sph,naux_sph,9])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1ip2', omega=omega):
-        rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
-        rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
         tmp = contract('xpji,ij->xpi', int3c_blk, dm0_tag[i0:i1,j0:j1])
         hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1])
-        hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp)
+        if with_k:
+            rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1])
+            rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1])
+            hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp)
     hj = hj.reshape([nao_sph,naux_sph,3,3])
-    hk = hk.reshape([nao_sph,naux_sph,3,3])
+    if with_k:
+        hk = hk.reshape([nao_sph,naux_sph,3,3])
     return hj, hk
 
 def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
@@ -928,15 +940,19 @@ def get_int3c2e_ipip2_hjk(intopt, rhoj, rhok, dm0_tag, with_k=True, omega=None):
     naux_sph = rhok.shape[0]
     orbo = cupy.asarray(dm0_tag.occ_coeff, order='C')
     hj = cupy.zeros([naux_sph,9])
-    hk = cupy.zeros([naux_sph,9])
+    hk = None
+    if with_k:
+        hk = cupy.zeros([naux_sph,9])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip2', omega=omega):
-        rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1])
-        rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1])
         tmp = contract('xpji,ij->xp', int3c_blk, dm0_tag[i0:i1,j0:j1])
         hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1])
-        hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp)
+        if with_k:
+            rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1])
+            rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1])
+            hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp)
     hj = hj.reshape([naux_sph,3,3])
-    hk = hk.reshape([naux_sph,3,3])
+    if with_k:
+        hk = hk.reshape([naux_sph,3,3])
     return hj, hk
 
 def get_int3c2e_ip_slice(intopt, cp_aux_id, ip_type, out=None, omega=None, stream=None):
@@ -1167,8 +1183,8 @@ def get_dh1e(mol, dm0):
     fakemol = gto.fakemol_for_charges(coords)
     intopt = VHFOpt(mol, fakemol, 'int2e')
     intopt.build(1e-14, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
-    dm0_sorted = dm0[cupy.ix_(intopt.sph_ao_idx, intopt.sph_ao_idx)]
-
+    #dm0_sorted = dm0[cupy.ix_(intopt.sph_ao_idx, intopt.sph_ao_idx)]
+    dm0_sorted = take_last2d(dm0, intopt.sph_ao_idx)
     dh1e = cupy.zeros([natm,3])
     for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1'):
         dh1e[k0:k1,:3] += cupy.einsum('xkji,ij->kx', int3c_blk, dm0_sorted[i0:i1,j0:j1])
diff --git a/gpu4pyscf/df/tests/test_df_ecp.py b/gpu4pyscf/df/tests/test_df_ecp.py
index e6614d53..3f73296b 100644
--- a/gpu4pyscf/df/tests/test_df_ecp.py
+++ b/gpu4pyscf/df/tests/test_df_ecp.py
@@ -51,11 +51,19 @@ def run_dft(xc):
     e_dft = mf.kernel()
     return e_dft
 
+def _make_mf():
+    mf = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
+    mf.conv_tol = 1e-12
+    mf.grids.level = grids_level
+    mf.verbose = 1
+    mf.kernel()
+    return mf
+
 def _check_grad(grid_response=False, tol=1e-5):
     mf = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
     mf.grids.level = grids_level
     mf.conv_tol = 1e-12
-    e_tot = mf.kernel()
+    mf.kernel()
     g = mf.nuc_grad_method()
     g.auxbasis_response = True
     g.grid_response = grid_response
@@ -85,22 +93,14 @@ def _check_grad(grid_response=False, tol=1e-5):
             mol.set_geom_(coords, unit='Bohr')
             grad_fd[i,j] = (e0-e1)/2.0/eps
     grad_fd = np.array(grad_fd).reshape(-1,3)
-    print('finite difference gradient:')
-    print(grad_fd)
-    print('difference between analytical and finite difference gradient:', np.linalg.norm(g_analy - grad_fd))
+    print('norm of analytical - finite difference gradient:', np.linalg.norm(g_analy - grad_fd))
     assert(np.linalg.norm(g_analy - grad_fd) < tol)
 
 
-def _check_dft_hessian(disp=None, ix=0, iy=0, tol=1e-3):
-    pmol = mol.copy()
+def _check_dft_hessian(mf, h, ix=0, iy=0, tol=1e-3):
+    pmol = mf.mol.copy()
     pmol.build()
 
-    mf = rks.RKS(pmol, xc=xc).density_fit(auxbasis=auxbasis)
-    mf.conv_tol = 1e-12
-    mf.grids.level = grids_level
-    mf.verbose = 1
-    mf.kernel()
-    
     g = mf.nuc_grad_method()
     g.auxbasis_response = True
     g.kernel()
@@ -112,31 +112,19 @@ def _check_dft_hessian(disp=None, ix=0, iy=0, tol=1e-3):
     pmol.set_geom_(coords + v, unit='Bohr')
     pmol.build()
     _, g0 = g_scanner(pmol)
-    
+
     pmol.set_geom_(coords - v, unit='Bohr')
     pmol.build()
     _, g1 = g_scanner(pmol)
-    
+
     h_fd = (g0 - g1)/2.0/eps
     pmol.set_geom_(coords, unit='Bohr')
     pmol.build()
 
-    mf = rks.RKS(pmol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis)
-    mf.grids.level = grids_level
-    mf.kernel()
-    hobj = mf.Hessian()
-    hobj.set(auxbasis_response=2)
-    hobj.verbose=0
-    h = hobj.kernel()
-    
-    print(f"analytical Hessian H({ix},{iy})")
-    print(h[ix,:,iy,:])
-    print(f"finite different Hessian H({ix},{iy})")
-    print(h_fd)
-    print('Norm of diff', np.linalg.norm(h[ix,:,iy,:] - h_fd))
+    print(f'Norm of finite difference - analytical Hessian({ix},{iy})', np.linalg.norm(h[ix,:,iy,:] - h_fd))
     assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol)
 
-class KnownValues(unittest.TestCase):    
+class KnownValues(unittest.TestCase):
     def test_rks_scf(self):
         mf = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
         mf.conv_tol = 1e-12
@@ -145,10 +133,16 @@ def test_rks_scf(self):
 
     def test_rks_gradient(self):
         _check_grad()
-    
+
     def test_rks_hessian(self):
-        _check_dft_hessian(ix=0, iy=0)
-        _check_dft_hessian(ix=0, iy=1)
+        mf = _make_mf()
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        hobj.verbose=0
+        h = hobj.kernel()
+
+        _check_dft_hessian(mf, h, ix=0, iy=0)
+        _check_dft_hessian(mf, h, ix=0, iy=1)
 
 if __name__ == "__main__":
     print("Full Tests for DF ECP")
diff --git a/gpu4pyscf/df/tests/test_df_grad.py b/gpu4pyscf/df/tests/test_df_grad.py
index 9156c2c0..4db5cf58 100644
--- a/gpu4pyscf/df/tests/test_df_grad.py
+++ b/gpu4pyscf/df/tests/test_df_grad.py
@@ -61,7 +61,7 @@ def _check_grad(grid_response=False, xc=xc0, disp=disp0, tol=1e-6):
     mf.nlcgrids.level = nlcgrids_level
     mf.conv_tol = 1e-10
     mf.verbose = 1
-    e_tot = mf.kernel()
+    mf.kernel()
 
     g = mf.nuc_grad_method()
     g.auxbasis_response = True
diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py
index 0b0d3f20..18b55c80 100644
--- a/gpu4pyscf/df/tests/test_df_hessian.py
+++ b/gpu4pyscf/df/tests/test_df_hessian.py
@@ -15,7 +15,6 @@
 
 import numpy as np
 import pyscf
-from pyscf import scf, dft
 from gpu4pyscf import dft, scf
 import unittest
 
@@ -40,21 +39,32 @@ def setUpModule():
 
 def tearDownModule():
     global mol
+    mol.stdout.close()
     del mol
 
-def _check_rhf_hessian(ix=0, iy=0, tol=1e-3):
-    pmol = mol.copy()
-    pmol.build()
-    
-    mf = scf.RHF(pmol).density_fit(auxbasis='ccpvtz-jkfit')
+def _make_rhf():
+    mf = scf.RHF(mol).density_fit(auxbasis='ccpvtz-jkfit')
+    mf.conv_tol = 1e-12
+    mf.kernel()
+    return mf
+
+def _make_rks(xc, disp=None):
+    mf = dft.rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis0)
     mf.conv_tol = 1e-12
+    mf.disp = disp
+    mf.grids.level = grids_level
+    mf.verbose = 1
     mf.kernel()
+    return mf
+
+def _check_rhf_hessian(mf, h, ix=0, iy=0, tol=1e-3):
+    pmol = mf.mol.copy()
+    pmol.build()
+
     g = mf.nuc_grad_method()
     g.kernel()
     g_scanner = g.as_scanner()
-    hobj = mf.Hessian()
-    hobj.set(auxbasis_response=2)
-    
+
     coords = pmol.atom_coords()
     v = np.zeros_like(coords)
     v[ix,iy] = eps
@@ -67,25 +77,13 @@ def _check_rhf_hessian(ix=0, iy=0, tol=1e-3):
     _, g1 = g_scanner(pmol)
     h_fd = (g0 - g1)/2.0/eps
 
-    pmol.set_geom_(coords, unit='Bohr')
-    h = hobj.kernel()
-    print(f"analytical Hessian H({ix},{iy})")
-    print(h[ix,:,iy,:])
-    print(f"finite different Hessian H({ix},{iy})")
-    print(h_fd)
-    print('Norm of diff', np.linalg.norm(h[ix,:,iy,:] - h_fd))
+    print(f'Norm of analytical - finite difference Hessian H({ix},{iy})', np.linalg.norm(h[ix,:,iy,:] - h_fd))
     assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol)
 
-def _check_dft_hessian(xc=xc0, disp=None, ix=0, iy=0, tol=1e-3):
-    pmol = mol.copy()
+def _check_dft_hessian(mf, h, ix=0, iy=0, tol=1e-3):
+    pmol = mf.mol.copy()
     pmol.build()
 
-    mf = dft.rks.RKS(pmol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0)
-    mf.conv_tol = 1e-12
-    mf.grids.level = grids_level
-    mf.verbose = 1
-    mf.kernel()
-    
     g = mf.nuc_grad_method()
     g.auxbasis_response = True
     g.kernel()
@@ -101,57 +99,67 @@ def _check_dft_hessian(xc=xc0, disp=None, ix=0, iy=0, tol=1e-3):
     pmol.set_geom_(coords - v, unit='Bohr')
     pmol.build()
     _, g1 = g_scanner(pmol)
-    
+
     h_fd = (g0 - g1)/2.0/eps
-    pmol.set_geom_(coords, unit='Bohr')
-    pmol.build()
 
-    mf = dft.rks.RKS(pmol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0)
-    mf.grids.level = grids_level
-    mf.kernel()
-    hobj = mf.Hessian()
-    hobj.set(auxbasis_response=2)
-    hobj.verbose=0
-    h = hobj.kernel()
-    
-    print(f"analytical Hessian H({ix},{iy})")
-    print(h[ix,:,iy,:])
-    print(f"finite different Hessian H({ix},{iy})")
-    print(h_fd)
-    print('Norm of diff', np.linalg.norm(h[ix,:,iy,:] - h_fd))
+    print(f'Norm of analytical - finite difference Hessian H({ix},{iy})', np.linalg.norm(h[ix,:,iy,:] - h_fd))
     assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol)
-    
+
 class KnownValues(unittest.TestCase):
     def test_hessian_rhf(self):
         print('-----testing DF RHF Hessian----')
-        _check_rhf_hessian(ix=0, iy=0)
-        _check_rhf_hessian(ix=0, iy=1)
-    
+        mf = _make_rhf()
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_rhf_hessian(mf, h, ix=0, iy=0)
+        _check_rhf_hessian(mf, h, ix=0, iy=1)
+
     def test_hessian_lda(self):
         print('-----testing DF LDA Hessian----')
-        _check_dft_hessian(xc='LDA', disp=None, ix=0,iy=0)
-        _check_dft_hessian(xc='LDA', disp=None, ix=0,iy=1)
-    
+        mf = _make_rks('LDA')
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+        _check_dft_hessian(mf, h, ix=0,iy=1)
+
     def test_hessian_gga(self):
         print('-----testing DF PBE Hessian----')
-        _check_dft_hessian(xc='PBE', disp=None, ix=0,iy=0)
-        _check_dft_hessian(xc='PBE', disp=None, ix=0,iy=1)
-    
+        mf = _make_rks('PBE')
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+        _check_dft_hessian(mf, h, ix=0,iy=1)
+
     def test_hessian_hybrid(self):
         print('-----testing DF B3LYP Hessian----')
-        _check_dft_hessian(xc='B3LYP', disp=None, ix=0,iy=0)
-        _check_dft_hessian(xc='B3LYP', disp=None, ix=0,iy=1)
+        mf = _make_rks('b3lyp')
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+        _check_dft_hessian(mf, h, ix=0,iy=1)
 
     def test_hessian_mgga(self):
         print('-----testing DF M06 Hessian----')
-        _check_dft_hessian(xc='m06', disp=None, ix=0,iy=0)
-        _check_dft_hessian(xc='m06', disp=None, ix=0,iy=1)
-    
+        mf = _make_rks('m06')
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+        _check_dft_hessian(mf, h, ix=0,iy=1)
+
     def test_hessian_rsh(self):
         print('-----testing DF wb97 Hessian----')
-        _check_dft_hessian(xc='wb97', disp=None, ix=0,iy=0)
-        _check_dft_hessian(xc='wb97', disp=None, ix=0,iy=1)
-    
+        mf = _make_rks('wb97')
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+        _check_dft_hessian(mf, h, ix=0,iy=1)
+
     def test_hessian_D3(self):
         pmol = mol.copy()
         pmol.build()
@@ -161,11 +169,11 @@ def test_hessian_D3(self):
         mf.grids.level = grids_level
         mf.verbose = 1
         mf.kernel()
-        
+
         hobj = mf.Hessian()
         hobj.set(auxbasis_response=2)
         hobj.verbose=0
-        h = hobj.kernel()
+        hobj.kernel()
 
     def test_hessian_D4(self):
         pmol = mol.copy()
@@ -180,7 +188,7 @@ def test_hessian_D4(self):
         hobj = mf.Hessian()
         hobj.set(auxbasis_response=2)
         hobj.verbose=0
-        h = hobj.kernel()
+        hobj.kernel()
 
 if __name__ == "__main__":
     print("Full Tests for DF Hessian")
diff --git a/gpu4pyscf/df/tests/test_geomopt.py b/gpu4pyscf/df/tests/test_geomopt.py
index 4960278f..e9df65c9 100644
--- a/gpu4pyscf/df/tests/test_geomopt.py
+++ b/gpu4pyscf/df/tests/test_geomopt.py
@@ -15,7 +15,6 @@
 
 import pyscf
 import numpy as np
-import dftd3.pyscf as disp
 import unittest
 from pyscf import lib
 from gpu4pyscf import scf
@@ -32,8 +31,8 @@
 
 xc='B3LYP'
 bas='def2-tzvpp'
-auxbasis='ccpvtz-jkfit'
 disp='d3bj'
+auxbasis='ccpvtz-jkfit'
 grids_level = 8
 
 def setUpModule():
@@ -46,14 +45,14 @@ def tearDownModule():
     global mol
     mol.stdout.close()
     del mol
-    
+
 eps = 1e-3
 
 class KnownValues(unittest.TestCase):
-    def test_geomopt(self):
+    def test_rks_geomopt(self):
         mf = rks.RKS(mol, xc=xc, disp=disp).density_fit()
         mf.grids.level = grids_level
-        e_tot = mf.kernel()
+        mf.kernel()
         mol_eq = optimize(mf, maxsteps=20)
         coords = mol_eq.atom_coords(unit='Ang')
         # reference from q-chem
@@ -61,22 +60,22 @@ def test_geomopt(self):
             [ 0.0000000000,     0.0000000000,     0.1164022656],
             [-0.7617088263,    -0.0000000000,    -0.4691011328],
             [0.7617088263,    -0.0000000000,    -0.4691011328]])
-        
+
         assert np.linalg.norm(coords - coords_qchem) < 1e-4
-    
-    def test_geomopt(self):
-        mf_GPU = scf.RHF(mol).density_fit()
-        e_tot = mf_GPU.kernel()
-        mol_eq = optimize(mf_GPU, maxsteps=20)
+
+    def test_rhf_geomopt(self):
+        mf = scf.RHF(mol).density_fit()
+        mf.kernel()
+        mol_eq = optimize(mf, maxsteps=20)
         coords = mol_eq.atom_coords(unit='Ang')
         # reference from q-chem
         coords_qchem = np.array([
             [0.0000000000,     0.0000000000,     0.1021249784],
             [-0.7519034531,    -0.0000000000,    -0.4619624892],
             [0.7519034531,    -0.0000000000,    -0.4619624892]])
-        
+
         assert np.linalg.norm(coords - coords_qchem) < 1e-4
-    
+
 if __name__ == "__main__":
     print("Full Tests for geometry optimization")
     unittest.main()
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index b4fd72b0..18cbba4a 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -125,8 +125,6 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     if mol is None: mol = ks.mol
     if dm is None: dm = ks.make_rdm1()
     t0 = logger.init_timer(ks)
-    if ks.grids.coords is None:
-        ks.grids.ao_values = None
     initialize_grids(ks, mol, dm)
 
     if hasattr(ks, 'screen_tol') and ks.screen_tol is not None:
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 82269257..c9157cc9 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -617,7 +617,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory):
             ao_dm0 = aow = None
     elif xctype == 'MGGA':
         if grids.level < 5:
-            logger.warn(mol, 'MGGA Hessian is sensitive to dft grids.')
+            log.warn('MGGA Hessian is sensitive to dft grids.')
         ao_deriv = 2
         for ao, mask, weight, coords \
                 in ni.block_loop(opt.mol, grids, nao, ao_deriv, max_memory):
diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py
index aca3b082..36bb6737 100644
--- a/gpu4pyscf/lib/cutensor.py
+++ b/gpu4pyscf/lib/cutensor.py
@@ -15,27 +15,24 @@
 
 import numpy as np
 import cupy
-from cupy._environment import _preload_libs
-from cupyx import cutensor
-from cupy_backends.cuda.libs import cutensor as cutensor_backend
-from cupy_backends.cuda.libs.cutensor import Handle
 from gpu4pyscf.lib import logger
 
-libcutensor = None
-for lib_path in _preload_libs['cutensor']:
-    try:
-        libcutensor = _preload_libs['cutensor'][lib_path]
-        break
-    except Exception:
-        continue
+try:
+    import cupy_backends.cuda.libs.cutensor  # NOQA
+    from cupyx import cutensor
+    from cupy_backends.cuda.libs import cutensor as cutensor_backend
+    from cupy_backends.cuda.libs.cutensor import Handle
+
+    _handle = Handle()
+    _modes = {}
+    _contraction_descriptors = {}
+    _contraction_plans = {}
+    _contraction_finds = {}
 
-_handle = Handle()
-_modes = {}
-_contraction_descriptors = {}
-_contraction_plans = {}
-_contraction_finds = {}
+    cutensor_backend.init(_handle)
 
-cutensor_backend.init(_handle)
+except ImportError:
+    cutensor = None
 
 def _create_mode_with_cache(mode):
     integer_mode = []
@@ -156,8 +153,8 @@ def contraction(pattern, a, b, alpha, beta, out=None):
 else:
     contract_engine = None
 
-if libcutensor is None:
-    contract_engine = 'cupy'
+if cutensor is None:
+    contract_engine = 'cupy'  # default contraction engine
 
 # override the 'contract' function if einsum is customized or cutensor is not found
 if contract_engine is not None:
diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py
index 6367b440..7b46fb27 100644
--- a/gpu4pyscf/lib/logger.py
+++ b/gpu4pyscf/lib/logger.py
@@ -119,4 +119,3 @@ def new_logger(rec=None, verbose=None):
     else:
         log = Logger(rec.stdout, rec.verbose)
     return log
-
diff --git a/gpu4pyscf/scf/cphf.py b/gpu4pyscf/scf/cphf.py
index fdd45453..52b4a84f 100644
--- a/gpu4pyscf/scf/cphf.py
+++ b/gpu4pyscf/scf/cphf.py
@@ -28,7 +28,7 @@
 from gpu4pyscf.lib import logger
 
 def solve(fvind, mo_energy, mo_occ, h1, s1=None,
-          max_cycle=50, tol=1e-9, hermi=False, verbose=logger.WARN):
+          max_cycle=50, tol=1e-7, hermi=False, verbose=logger.WARN):
     '''
     Args:
         fvind : function
diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
index 88001e35..2a6d4904 100644
--- a/gpu4pyscf/solvent/grad/pcm.py
+++ b/gpu4pyscf/solvent/grad/pcm.py
@@ -140,9 +140,10 @@ def get_dD_dS(surface, dF, with_S=True, with_D=False):
 
     return dD, dS, dSii
 
-def grad_nuc(pcmobj):
-    if not pcmobj._intermediates:
-        pcmobj.build()
+def grad_nuc(pcmobj, dm):
+    if not pcmobj._intermediates or 'q_sym' not in pcmobj._intermediates:
+        pcmobj._get_vind(dm)
+
     mol = pcmobj.mol
     q_sym        = pcmobj._intermediates['q_sym'].get()
     gridslice    = pcmobj.surface['gslice_by_atom']
@@ -169,33 +170,23 @@ def grad_nuc(pcmobj):
     de -= numpy.asarray([numpy.sum(dv_g[p0:p1], axis=0) for p0,p1 in gridslice])
     return de
 
-def grad_elec(pcmobj, dm):
+def grad_qv(pcmobj, dm):
     '''
-    dE = 0.5*v* d(K^-1 R) *v + q*dv
-    v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q)
+    contributions due to integrals
     '''
-    if not pcmobj._intermediates:
-        pcmobj.build()
+    if not pcmobj._intermediates or 'q_sym' not in pcmobj._intermediates:
+        pcmobj._get_vind(dm)
 
     gridslice    = pcmobj.surface['gslice_by_atom']
-    v_grids      = pcmobj._intermediates['v_grids']
-    A            = pcmobj._intermediates['A']
-    D            = pcmobj._intermediates['D']
-    S            = pcmobj._intermediates['S']
-    K            = pcmobj._intermediates['K']
-    q            = pcmobj._intermediates['q']
     q_sym        = pcmobj._intermediates['q_sym']
 
-    vK_1 = cupy.linalg.solve(K.T, v_grids)
-
-    # ----------------- potential response -----------------------
-
     intopt = pcmobj.intopt
     intopt.clear()
     # rebuild with aosym
     intopt.build(1e-14, diag_block_with_triu=True, aosym=False)
     coeff = intopt.coeff
-    dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
+    dm_cart = coeff @ dm @ coeff.T
+    #dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
 
     dvj, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip1', q_sym, None, dm_cart)
     dq, _ = int3c2e.get_int3c2e_ip_jk(intopt, 0, 'ip2', q_sym, None, dm_cart)
@@ -208,15 +199,32 @@ def grad_elec(pcmobj, dm):
     dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice])
     dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]])
     de = dq + dvj
+    return de.get()
+
+def grad_solver(pcmobj, dm):
+    '''
+    dE = 0.5*v* d(K^-1 R) *v + q*dv
+    v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q)
+    '''
+    if not pcmobj._intermediates or 'q_sym' not in pcmobj._intermediates:
+        pcmobj._get_vind(dm)
+
+    gridslice    = pcmobj.surface['gslice_by_atom']
+    v_grids      = pcmobj._intermediates['v_grids']
+    A            = pcmobj._intermediates['A']
+    D            = pcmobj._intermediates['D']
+    S            = pcmobj._intermediates['S']
+    K            = pcmobj._intermediates['K']
+    q            = pcmobj._intermediates['q']
+
+    vK_1 = cupy.linalg.solve(K.T, v_grids)
 
-    ## --------------- response from stiffness matrices ----------------
-    gridslice = pcmobj.surface['gslice_by_atom']
     dF, dA = get_dF_dA(pcmobj.surface)
 
-    with_D = pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE'
+    with_D = pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE']
     dD, dS, dSii = get_dD_dS(pcmobj.surface, dF, with_D=with_D, with_S=True)
 
-    if pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE':
+    if pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE']:
         DA = D*A
 
     epsilon = pcmobj.eps
@@ -224,13 +232,31 @@ def grad_elec(pcmobj, dm):
     #de_dF = v0 * -dSii_dF * q
     #de += 0.5*numpy.einsum('i,inx->nx', de_dF, dF)
     # dQ = v^T K^-1 (dR - dK K^-1 R) v
-    if pcmobj.method.upper() == 'C-PCM' or pcmobj.method.upper() == 'COSMO':
+    de = cupy.zeros([pcmobj.mol.natm,3])
+    if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']:
+        dS = dS.transpose([2,0,1])
+        dSii = dSii.transpose([2,0,1])
+
         # dR = 0, dK = dS
-        de_dS = cupy.einsum('i,ijx,j->ix', vK_1, dS, q)
+        de_dS = (vK_1 * dS.dot(q)).T                  # cupy.einsum('i,xij,j->ix', vK_1, dS, q)
         de -= cupy.asarray([cupy.sum(de_dS[p0:p1], axis=0) for p0,p1, in gridslice])
-        de -= 0.5*cupy.einsum('i,ijx,i->jx', vK_1, dSii, q)
+        de -= 0.5*contract('i,xij->jx', vK_1*q, dSii) # 0.5*cupy.einsum('i,xij,i->jx', vK_1, dSii, q)
+
+    elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE']:
+        dD = dD.transpose([2,0,1])
+        dS = dS.transpose([2,0,1])
+        dSii = dSii.transpose([2,0,1])
+        dA = dA.transpose([2,0,1])
+        def contract_bra(a, B, c):
+            ''' i,xij,j->jx '''
+            tmp = contract('i,xij->xj', a, B)
+            return (tmp * c).T
+
+        def contract_ket(a, B, c):
+            ''' i,xij,j->ix '''
+            tmp = B.dot(c)
+            return (a*tmp).T
 
-    elif pcmobj.method.upper() == 'IEF-PCM' or pcmobj.method.upper() == 'SS(V)PE':
         # IEF-PCM and SS(V)PE formally are the same in gradient calculation
         # dR = f_eps/(2*pi) * (dD*A + D*dA),
         # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
@@ -238,30 +264,37 @@ def grad_elec(pcmobj, dm):
         fac = f_epsilon/(2.0*PI)
 
         Av = A*v_grids
-        de_dR  = 0.5*fac * cupy.einsum('i,ijx,j->ix', vK_1, dD, Av)
-        de_dR -= 0.5*fac * cupy.einsum('i,ijx,j->jx', vK_1, dD, Av)
+        de_dR  = 0.5*fac * contract_ket(vK_1, dD, Av)
+        de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av)
         de_dR  = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dR += 0.5*fac * cupy.einsum('i,ij,jnx,j->nx', vK_1, D, dA, v_grids)
 
-        de_dS0  = 0.5*cupy.einsum('i,ijx,j->ix', vK_1, dS, q)
-        de_dS0 -= 0.5*cupy.einsum('i,ijx,j->jx', vK_1, dS, q)
+        vK_1_D = vK_1.dot(D)
+        vK_1_Dv = vK_1_D * v_grids
+        de_dR += 0.5*fac * cupy.einsum('j,xjn->nx', vK_1_Dv, dA)
+
+        de_dS0  = 0.5*contract_ket(vK_1, dS, q)
+        de_dS0 -= 0.5*contract_bra(vK_1, dS, q)
         de_dS0  = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dS0 += 0.5*cupy.einsum('i,inx,i->nx', vK_1, dSii, q)
+
+        vK_1_q = vK_1 * q
+        de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii)
 
         vK_1_DA = cupy.dot(vK_1, DA)
-        de_dS1  = 0.5*cupy.einsum('j,jkx,k->jx', vK_1_DA, dS, q)
-        de_dS1 -= 0.5*cupy.einsum('j,jkx,k->kx', vK_1_DA, dS, q)
+        de_dS1  = 0.5*contract_ket(vK_1_DA, dS, q)
+        de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q)
         de_dS1  = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice])
-        de_dS1 += 0.5*cupy.einsum('j,jnx,j->nx', vK_1_DA, dSii, q)
+
+        vK_1_DAq = vK_1_DA*q
+        de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii)
 
         Sq = cupy.dot(S,q)
         ASq = A*Sq
-        de_dD  = 0.5*cupy.einsum('i,ijx,j->ix', vK_1, dD, ASq)
-        de_dD -= 0.5*cupy.einsum('i,ijx,j->jx', vK_1, dD, ASq)
+        de_dD  = 0.5*contract_ket(vK_1, dD, ASq)
+        de_dD -= 0.5*contract_bra(vK_1, dD, ASq)
         de_dD  = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice])
 
         vK_1_D = cupy.dot(vK_1, D)
-        de_dA = 0.5*cupy.einsum('j,jnx,j->nx', vK_1_D, dA, Sq)
+        de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA)   # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq)
 
         de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1)
         de += de_dR - de_dK
@@ -287,8 +320,9 @@ def kernel(self, *args, dm=None, atmlst=None, **kwargs):
             if dm is None:
                 dm = self.base.make_rdm1(ao_repr=True)
 
-            self.de_solvent = grad_elec(self.base.with_solvent, dm)
-            self.de_solvent+= grad_nuc(self.base.with_solvent)
+            self.de_solvent = grad_qv(self.base.with_solvent, dm)
+            self.de_solvent+= grad_solver(self.base.with_solvent, dm)
+            self.de_solvent+= grad_nuc(self.base.with_solvent, dm)
 
             self.de_solute = grad_method_class.kernel(self, *args, **kwargs)
             self.de = self.de_solute + self.de_solvent
diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py
index 5d2c4dac..4fe16261 100644
--- a/gpu4pyscf/solvent/hessian/pcm.py
+++ b/gpu4pyscf/solvent/hessian/pcm.py
@@ -25,7 +25,7 @@
 from pyscf import gto, df
 from pyscf.grad import rhf as rhf_grad
 from gpu4pyscf.solvent.pcm import PI, switch_h
-from gpu4pyscf.solvent.grad.pcm import grad_switch_h, get_dF_dA, get_dD_dS, grad_elec, grad_nuc
+from gpu4pyscf.solvent.grad.pcm import grad_switch_h, get_dF_dA, get_dD_dS, grad_qv, grad_solver, grad_nuc
 from gpu4pyscf.df import int3c2e
 from gpu4pyscf.lib.cupy_helper import contract
 from gpu4pyscf.lib import logger
@@ -86,6 +86,35 @@ def hess_nuc(pcmobj):
 
     return de
 
+def hess_qv(pcmobj, dm, verbose=None):
+    if not pcmobj._intermediates or 'q_sym' not in pcmobj._intermediates:
+        pcmobj._get_vind(dm)
+    gridslice    = pcmobj.surface['gslice_by_atom']
+    q_sym        = pcmobj._intermediates['q_sym']
+
+    intopt = pcmobj.intopt
+    intopt.clear()
+    # rebuild with aosym
+    intopt.build(1e-14, diag_block_with_triu=True, aosym=False)
+    coeff = intopt.coeff
+    dm_cart = coeff @ dm @ coeff.T
+    #dm_cart = cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff)
+
+    dvj, _ = int3c2e.get_int3c2e_ipip1_hjk(intopt, q_sym, None, dm_cart, with_k=False)
+    dq, _ = int3c2e.get_int3c2e_ipvip1_hjk(intopt, q_sym, None, dm_cart, with_k=False)
+    dvj, _ = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, q_sym, None, dm_cart, with_k=False)
+    dq, _ = int3c2e.get_int3c2e_ipip2_hjk(intopt, q_sym, None, dm_cart, with_k=False)
+
+    cart_ao_idx = intopt.cart_ao_idx
+    rev_cart_ao_idx = numpy.argsort(cart_ao_idx)
+    dvj = dvj[:,rev_cart_ao_idx]
+
+    aoslice = intopt.mol.aoslice_by_atom()
+    dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice])
+    dvj= 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]])
+    de = dq + dvj
+    return de.get()
+
 def hess_elec(pcmobj, dm, verbose=None):
     '''
     slow version with finite difference
@@ -98,10 +127,11 @@ def hess_elec(pcmobj, dm, verbose=None):
     coords = mol.atom_coords(unit='Bohr')
 
     def pcm_grad_scanner(mol):
+        # TODO: use more analytical forms
         pcmobj.reset(mol)
         e, v = pcmobj._get_vind(dm)
         #return grad_elec(pcmobj, dm)
-        return grad_nuc(pcmobj) + grad_elec(pcmobj, dm)
+        return grad_nuc(pcmobj, dm) + grad_solver(pcmobj, dm) + grad_qv(pcmobj, dm)
 
     de = numpy.zeros([mol.natm, mol.natm, 3, 3])
     eps = 1e-3
@@ -121,7 +151,7 @@ def pcm_grad_scanner(mol):
     pcmobj.reset(pmol)
     return de
 
-def grad_qv(pcmobj, mo_coeff, mo_occ, atmlst=None, verbose=None):
+def fd_grad_vmat(pcmobj, mo_coeff, mo_occ, atmlst=None, verbose=None):
     '''
     dv_solv / da
     slow version with finite difference
@@ -164,6 +194,35 @@ def pcm_vmat_scanner(mol):
     pcmobj.reset(pmol)
     return vmat
 
+"""
+def analytic_grad_vmat(pcmobj, mo_coeff, mo_occ, atmlst=None, verbose=None):
+    '''
+    dv_solv / da
+    slow version with finite difference
+    '''
+    log = logger.new_logger(pcmobj, verbose)
+    t1 = log.init_timer()
+    pmol = pcmobj.mol.copy()
+    mol = pmol.copy()
+    if atmlst is None:
+        atmlst = range(mol.natm)
+    nao, nmo = mo_coeff.shape
+    mocc = mo_coeff[:,mo_occ>0]
+    nocc = mocc.shape[1]
+    dm = cupy.dot(mocc, mocc.T) * 2
+    coords = mol.atom_coords(unit='Bohr')
+
+    # TODO: add those contributions
+    # contribution due to _get_v
+    # contribution due to linear solver
+    # contribution due to _get_vmat
+
+    vmat = cupy.zeros([len(atmlst), 3, nao, nocc])
+
+    t1 = log.timer_debug1('computing solvent grad veff', *t1)
+    pcmobj.reset(pmol)
+    return vmat
+"""
 def make_hess_object(hess_method):
     '''
     return solvent hessian object
@@ -193,7 +252,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
             if atmlst is None:
                 atmlst = range(self.mol.natm)
             h1ao = hess_method_class.make_h1(self, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
-            dv = grad_qv(self.base.with_solvent, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
+            dv = fd_grad_vmat(self.base.with_solvent, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose)
             for i0, ia in enumerate(atmlst):
                 h1ao[i0] += dv[i0]
             return h1ao
diff --git a/gpu4pyscf/solvent/pcm.py b/gpu4pyscf/solvent/pcm.py
index e9634628..0604b046 100644
--- a/gpu4pyscf/solvent/pcm.py
+++ b/gpu4pyscf/solvent/pcm.py
@@ -243,7 +243,7 @@ def build(self, ng=None):
         D, S = get_D_S(self.surface, with_S=True, with_D=True)
 
         epsilon = self.eps
-        if self.method.upper() == 'C-PCM':
+        if self.method.upper() in ['C-PCM', 'CPCM']:
             f_epsilon = (epsilon-1.)/epsilon
             K = S
             R = -f_epsilon * cupy.eye(K.shape[0])
@@ -251,7 +251,7 @@ def build(self, ng=None):
             f_epsilon = (epsilon - 1.0)/(epsilon + 1.0/2.0)
             K = S
             R = -f_epsilon * cupy.eye(K.shape[0])
-        elif self.method.upper() == 'IEF-PCM':
+        elif self.method.upper() in ['IEF-PCM', 'IEFPCM']:
             f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
             DA = D*A
             DAS = cupy.dot(DA, S)
diff --git a/gpu4pyscf/solvent/tests/test_pcm_grad.py b/gpu4pyscf/solvent/tests/test_pcm_grad.py
index 677aa285..2cacfbd7 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_grad.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_grad.py
@@ -109,14 +109,14 @@ def test_grad_CPCM(self):
         mf = scf.RHF(mol).PCM(cm)
         mf.verbose = 0
         mf.conv_tol = 1e-12
-        e_tot = mf.kernel()
+        mf.kernel()
 
         g = mf.nuc_grad_method()
         grad = g.kernel()
 
         g0 = numpy.asarray([
-             [0.49773047433563E-15,  -0.12128126037559E-15,  -0.58936988992306E-01],
-             [0.22810111996954E-01,  -0.68951901317025E-17,   0.29468494708267E-01],
+            [0.49773047433563E-15,  -0.12128126037559E-15,  -0.58936988992306E-01],
+            [0.22810111996954E-01,  -0.68951901317025E-17,   0.29468494708267E-01],
             [-0.22810111996957E-01,   0.12949813945902E-15,   0.29468494708266E-01]])
 
         print(f"Gradient error in CPCM: {numpy.linalg.norm(g0 - grad)}")
@@ -131,7 +131,7 @@ def test_grad_COSMO(self):
         mf = scf.RHF(mol).PCM(cm)
         mf.verbose = 0
         mf.conv_tol = 1e-12
-        e_tot = mf.kernel()
+        mf.kernel()
 
         g = mf.nuc_grad_method()
         grad = g.kernel()
@@ -153,14 +153,14 @@ def test_grad_IEFPCM(self):
         mf = scf.RHF(mol).PCM(cm)
         mf.verbose = 0
         mf.conv_tol = 1e-12
-        e_tot = mf.kernel()
+        mf.kernel()
 
         g = mf.nuc_grad_method()
         grad = g.kernel()
 
         g0 = numpy.asarray([
-             [0.18357915015649E-14,   0.14192681822347E-15,  -0.58988087999658E-01],
-             [0.22822709179063E-01,  -0.10002010417168E-15,   0.29494044211805E-01],
+            [0.18357915015649E-14,   0.14192681822347E-15,  -0.58988087999658E-01],
+            [0.22822709179063E-01,  -0.10002010417168E-15,   0.29494044211805E-01],
             [-0.22822709179066E-01,  -0.31051364515588E-16,   0.29494044211806E-01]])
         print(f"Gradient error in IEFPCM: {numpy.linalg.norm(g0 - grad)}")
         assert numpy.linalg.norm(g0 - grad) < 1e-9
@@ -174,14 +174,14 @@ def test_grad_SSVPE(self):
         mf = scf.RHF(mol).PCM(cm)
         mf.verbose = 0
         mf.conv_tol = 1e-12
-        e_tot = mf.kernel()
+        mf.kernel()
 
         g = mf.nuc_grad_method()
         grad = g.kernel()
 
         g0 = numpy.asarray([
-             [0.76104817971710E-15,   0.11185701540547E-15,  -0.58909172879217E-01],
-             [0.22862990009767E-01,  -0.13861633974903E-15,   0.29454586651678E-01],
+            [0.76104817971710E-15,   0.11185701540547E-15,  -0.58909172879217E-01],
+            [0.22862990009767E-01,  -0.13861633974903E-15,   0.29454586651678E-01],
             [-0.22862990009769E-01,   0.34988765678591E-16,   0.29454586651679E-01]])
         print(f"Gradient error in SS(V)PE: {numpy.linalg.norm(g0 - grad)}")
         assert numpy.linalg.norm(g0 - grad) < 1e-9
diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
index ff6074d4..5a21ca00 100644
--- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py
+++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py
@@ -42,11 +42,8 @@ def tearDownModule():
     mol.stdout.close()
     del mol
 
-def _check_hessian(method='C-PCM', ix=0, iy=0):
-    pmol = mol.copy()
-    pmol.build()
-
-    mf = dft.rks.RKS(pmol, xc=xc).density_fit().PCM()
+def _make_mf(method='C-PCM'):
+    mf = dft.rks.RKS(mol, xc=xc).density_fit().PCM()
     mf.with_solvent.method = method
     mf.with_solvent.eps = epsilon
     mf.with_solvent.lebedev_order = lebedev_order
@@ -54,6 +51,11 @@ def _check_hessian(method='C-PCM', ix=0, iy=0):
     mf.grids.atom_grid = (99,590)
     mf.verbose = 0
     mf.kernel()
+    return mf
+
+def _check_hessian(mf, h, ix=0, iy=0):
+    pmol = mf.mol.copy()
+    pmol.build()
 
     g = mf.nuc_grad_method()
     g.auxbasis_response = True
@@ -72,28 +74,28 @@ def _check_hessian(method='C-PCM', ix=0, iy=0):
     _, g1 = g_scanner(pmol)
 
     h_fd = (g0 - g1)/2.0/eps
-    pmol.set_geom_(coords, unit='Bohr')
-    pmol.build()
-
-    hobj = mf.Hessian()
-    hobj.set(auxbasis_response=2)
-    h = hobj.kernel()
 
-    print(f"analytical Hessian H({ix},{iy})")
-    print(h[ix,:,iy,:])
-    print(f"finite different Hessian H({ix},{iy})")
-    print(h_fd)
-    print('Norm of diff', np.linalg.norm(h[ix,:,iy,:] - h_fd))
+    print(f'Norm of H({ix},{iy}) diff, {np.linalg.norm(h[ix,:,iy,:] - h_fd)}')
     assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol)
 
 class KnownValues(unittest.TestCase):
     def test_hess_cpcm(self):
-        _check_hessian(method='C-PCM', ix=0, iy=0)
-        _check_hessian(method='C-PCM', ix=0, iy=1)
+        print('testing C-PCM Hessian')
+        mf = _make_mf(method='C-PCM')
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_hessian(mf, h, ix=0, iy=0)
+        _check_hessian(mf, h, ix=0, iy=1)
 
     def test_hess_iefpcm(self):
-        _check_hessian(method='IEF-PCM', ix=0, iy=0)
-        _check_hessian(method='IEF-PCM', ix=0, iy=1)
+        print("testing IEF-PCM hessian")
+        mf = _make_mf(method='IEF-PCM')
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_hessian(mf, h, ix=0, iy=0)
+        _check_hessian(mf, h, ix=0, iy=1)
 
 if __name__ == "__main__":
     print("Full Tests for Hessian of PCMs")