Merge branch 'master' of github.com:argonne-lcf/sdl_workshop
rickybalin committed Oct 6, 2022
2 parents 04e341d + 808e661 commit d0c7296
Showing 114 changed files with 871 additions and 67 deletions.
25 changes: 7 additions & 18 deletions couplingSimulationML/ML_PythonC++_Embedding/README.md
@@ -1,4 +1,4 @@
-This example was written by Romit Maulik and edited by Bethany Lusch.
+This example was originally written by Romit Maulik, modified by Saumil Patel to use OCCA, and edited by Bethany Lusch.

# Description

@@ -12,30 +12,19 @@ In addition, this code also highlights the advantages of integrating the Python
3. A potential interface (if there are no issues with security) to streaming data from the internet (say, from a Python API).
4. Easy ability to save data using formats like HDF5 or NetCDF4.

-The test-case demonstrated here is representative of several Sci-ML workloads. We aim to build a surrogate model using TensorFlow in Python from data generated by a C++ computation. The methodology we utilize is something called the "POD-LSTM" - here snapshots of the solution field are linearly compressed using an SVD and the compressed representations are used as training data within a long short-term memory (LSTM) neural network. The LSTM is used to forecast compressed representations of the solution field in the future (for more details please visit our Editor's pick article [here](https://doi.org/10.1063/5.0019884)). However, this educational proxy-app may easily be modified to solve more complex problems (for example: closure modeling, data assimilation, and control) which assess the interplay of compute and ML.
+The test case demonstrated here captures a modal decomposition using an SVD and is representative of a Sci-ML workload. We aim to build this toward a surrogate model using TensorFlow in Python from data generated by a C++ computation. Further details on developing the surrogate model can be found [here](archive/ThetaGPU/Background.md).

-Here is how we connected C++ and Python:
+Here is how we connected C++ and Python, although we didn't include the neural network training in this demo:
![Coupling](CouplingDiagram.png)

-For running this mini-app on ThetaKNL/GPU, look at the scripts/README within `ML_PythonC++_Embedding/ThetaGPU/` and `ML_PythonC++_Embedding/ThetaKNL/` subdirectories.
+For running this mini-app on ThetaGPU, look at the scripts/README within the `ML_PythonC++_Embedding/ThetaGPU_OCCA/` subdirectory.

To see a similar coupling but with OpenFOAM on the C++ side (CFD software), check out our PythonFOAM work ([paper](https://arxiv.org/pdf/2103.09389.pdf), [code](https://github.com/argonne-lcf/PythonFOAM)).

-## What you should see when you run this on either ThetaGPU or ThetaKNL
+## What you should see when you run this on ThetaGPU

### Field evolution
-![Fields](ThetaKNL/app_build/Field_evolution.png "Fields")
+![Fields](archive/ThetaKNL/app_build/Field_evolution.png "Fields")

### Modal decomposition
-![Modes](ThetaKNL/app_build/SVD_Eigenvectors.png "Modes")
-
-### Forecasting the modal evolution in time (still rather poor but you get the idea)
-![Forecasting Mode 0](ThetaKNL/app_build/Mode_0_prediction.png "Mode 0 prediction")
-
-![Forecasting Mode 1](ThetaKNL/app_build/Mode_1_prediction.png "Mode 1 prediction")
-
-![Forecasting Mode 2](ThetaKNL/app_build/Mode_2_prediction.png "Mode 2 prediction")
-
-In order to get good results in [our paper](https://doi.org/10.1063/5.0019884), we use more training data. The code for that paper is [here](https://github.com/rmjcs2020/NATSurrogates).
+![Modes](archive/ThetaKNL/app_build/SVD_Eigenvectors.png "Modes")
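
To make the modal-decomposition step concrete, here is a minimal POD-via-SVD sketch in NumPy. This is an illustration only: the function name, array shapes, and random data are assumptions, not code from the repository's `python_module.py`.

```
import numpy as np

def pod_modes(snapshots, num_modes=3):
    # snapshots: (num_points, num_snapshots), one solution field per column
    # Subtract the temporal mean so the modes capture fluctuations
    fluctuations = snapshots - snapshots.mean(axis=1, keepdims=True)
    # Thin SVD: columns of u are the spatial (POD) modes,
    # s holds the singular values (modal energies)
    u, s, vt = np.linalg.svd(fluctuations, full_matrices=False)
    return u[:, :num_modes], s

# Illustrative use: 256 grid points, 100 snapshots of a 1D field
snapshots = np.random.rand(256, 100)
modes, sigma = pod_modes(snapshots)
print(modes.shape)  # (256, 3)
```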
84 changes: 84 additions & 0 deletions couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/CMakeLists.txt
@@ -0,0 +1,84 @@
cmake_minimum_required(VERSION 3.0)
project(burger VERSION 0.1)

enable_language(C)
enable_language(CXX)
enable_language(Fortran)
enable_testing()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

include(GNUInstallDirs)
include(FortranCInterface)

set(dfloat "double")
set(dfloatString "\"double\"")
set(dfloatFormat "\"%lf\"")
set(dlong "long int")
set(dlongString "\"long int\"")
set(dlongFormat "\"%ld\"")
set(hlong "long long int")
set(hlongString "\"long long int\"")
set(hlongFormat "\"%lld\"")
set(MPI_DFLOAT "MPI_DOUBLE")
set(MPI_DLONG "MPI_LONG_INT")
set(MPI_HLONG "MPI_LONG_LONG_INT")

#option(USE_MPI "Use the MPI library." ON)
#if(USE_MPI)
# find_package(MPI)
#endif()

#option(EXTERNAL_BLASLAPACK "Use an external/vendor-provided Blas/Lapack library" OFF)
#if(EXTERNAL_BLASLAPACK)
# find_package(BLAS)
# find_package(LAPACK)
#endif()

option(EXTERNAL_OCCA "Use an external OCCA build" ON)
if(EXTERNAL_OCCA)
find_library(OCCA_LIBRARY_PATH NAMES occa libocca PATHS $ENV{OCCA_DIR}/lib)
find_path(OCCA_INCLUDE_PATH NAMES occa.hpp occa.h PATHS $ENV{OCCA_DIR}/include)
add_library(libocca UNKNOWN IMPORTED)
set_target_properties(libocca PROPERTIES IMPORTED_LOCATION ${OCCA_LIBRARY_PATH})
target_include_directories(libocca INTERFACE ${OCCA_INCLUDE_PATH})
endif()

add_compile_definitions(
"USE_OCCA_MEM_BYTE_ALIGN=64"
"dfloat=double"
"dfloatString=\"double\""
"dfloatFormat=\"%lf\""
"MPI_DFLOAT=MPI_DOUBLE"
"dlong=int"
"dlongString=\"int\""
"dlongFormat=\"%d\""
"MPI_DLONG=MPI_INT"
"hlong=long long int"
"hlongString=\"long long int\""
"hlongFormat=\"%lld\""
"MPI_HLONG=MPI_LONG_LONG_INT"
)

add_executable(burger main.cpp)

include_directories(/lus/grand/projects/catalyst/world-shared/spatel/occa/src)

include_directories(/lus/theta-fs0/software/thetagpu/conda/2021-11-30/mconda3/lib/python3.8/site-packages/tensorflow/include/external/local_config_python/numpy_include)
include_directories(/lus/theta-fs0/software/thetagpu/conda/2021-11-30/mconda3/include)
include_directories(/lus/theta-fs0/software/thetagpu/conda/2021-11-30/mconda3/include/python3.8)
include_directories(/soft/hpc-sdk/Linux_x86_64/21.7/cuda/include)

find_library(PYTHONLIB python3.8 HINTS /lus/theta-fs0/software/thetagpu/conda/2021-11-30/mconda3/lib)

target_include_directories(burger PUBLIC ${CMAKE_SOURCE_DIR}/core)
target_link_libraries(burger
PUBLIC
libocca
${PYTHONLIB}
)

install(TARGETS burger RUNTIME DESTINATION ".")
install(DIRECTORY kernel/ DESTINATION kernel)
install(FILES python_module.py DESTINATION ".")
192 changes: 192 additions & 0 deletions couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/README.md
@@ -0,0 +1,192 @@
# Description

The goal of this implementation is to provide an example of how one can integrate a Python-based machine learning (ML) framework within a computational physics (PDE) solver. Like most GPU-enabled solvers, the physics kernel is executed on the device, where critical field data resides. This implementation makes use of the [CuPy](https://cupy.dev/) framework to perform in-situ analysis on the device, thereby avoiding the cost of data movement to the host. Furthermore, this example demonstrates how to couple the ML workflow with an application that uses a performance-portability abstraction layer, namely [OCCA](https://github.com/libocca/occa), which executes physics kernels on the device for a variety of backend-specific programming models (e.g., CUDA, HIP, SYCL).

## Requirements

- [OCCA](https://github.com/libocca/occa)
- cmake
- C++17 compiler
- C11 compiler
- CUDA 9 or later
- Virtual Python Environment

All of the above are provided on ThetaGPU.

## Building and Running

We assume that you have cloned the repo to a suitable location. These are the steps to execute this code on ThetaGPU (interactively):

1. From a Theta login node, log in to a ThetaGPU service node
```
ssh thetagpusn1
```
2. Request an interactive session on an A100 GPU
```
qsub -A SDL_Workshop \
-q training-gpu \
-I \
-n 1 \
--attrs filesystems=home,grand,eagle \
-t 60
```
3. Set the environment

Edit `OCCA_CACHE_DIR` in `set_OCCA_env.sh`. Then run `source set_OCCA_env.sh`. This loads modules and sets certain environment variables.

```
$ cat set_OCCA_env.sh
module load cmake
module load conda/2021-11-30
SDL=sdl_workshop/couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/
export OCCA_DIR=/lus/grand/projects/catalyst/world-shared/spatel/occa/install
export OCCA_CACHE_DIR=/path/to/$SDL
export OCCA_CXX="g++"
export OCCA_CXXFLAGS="-O3"
export OCCA_CUDA_COMPILER="nvcc"
export OCCA_CUDA_COMPILER_FLAGS="-O3 --fmad=true"
export PATH+=":${OCCA_DIR}/bin"
export LD_LIBRARY_PATH+=":${OCCA_DIR}/lib"
```
NOTE: the `OCCA_CACHE_DIR` specifies a location where OCCA-specific kernels (.okl) are stored/cached. Please make note of your path to this directory.

4. Check the environment.
```
$ module list
Currently Loaded Modules:
1) Core/StdEnv 2) cmake/3.19.5 3) openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0 4) conda/2021-11-30
```
```
$ occa info
========+======================+=================================
CPU(s) | Processor Name | AMD EPYC 7742 64-Core Processor
| Memory | 1007.69 GB
| Clock Frequency | 2.2 MHz
| SIMD Instruction Set | SSE2
| SIMD Width | 128 bits
| L1d Cache Size | 4 MB
| L1i Cache Size | 4 MB
| L2 Cache Size | 64 MB
| L3 Cache Size | 512 MB
========+======================+=================================
CUDA | Device Name | NVIDIA A100-SXM4-40GB
| Device ID | 0
| Memory | 39.58 GB
========+======================+=================================
```
5. Activate the virtual Python environment

```
$ conda activate
```
6. Build by running the `build.sh` script from this directory. This script is the cmake driver.

```
$ sh build.sh
```
You should see output that looks something like:
```
-- Build files have been written to: ...
+ cmake --build .../ML_PythonC++_Embedding/ThetaGPU_OCCA/build --parallel 4
Scanning dependencies of target burger
[ 50%] Building CXX object CMakeFiles/burger.dir/main.cpp.o
[100%] Linking CXX executable burger
[100%] Built target burger
+ cmake --install ... --prefix ...
-- Install configuration: "RelWithDebInfo"
-- Installing: .../ML_PythonC++_Embedding/ThetaGPU_OCCA/install/./burger
-- Set runtime path of ".../ThetaGPU_OCCA/install/./burger" to ""
-- Installing: .../ML_PythonC++_Embedding/ThetaGPU_OCCA/install/kernel
-- Installing: .../ML_PythonC++_Embedding/ThetaGPU_OCCA/install/kernel/burger.okl
-- Installing: .../ML_PythonC++_Embedding/ThetaGPU_OCCA/install/./python_module.py
```
7. Run

```
$ cd install/
$ ls
burger kernel python_module.py
$ ls kernel/
burger.okl
$ ./burger
Initialization of Python: Done
Within Python Module File
Loaded Python Module File: Done
Loaded Functions: Done
Called python data collection function successfully
time = 0.001
Called python data collection function successfully
Called python data collection function successfully
Called python data collection function successfully
...
...
...
Called python data collection function successfully
Called python data collection function successfully
time = 2.001
Mean Wall-Time: 0.0469978
A random value in the solution array: 0.0733873
Called python analyses function successfully
Performing SVD
Called python analyses function successfully
$ ls
burger Field_evolution.png kernel python_module.py SVD_Eigenvectors.png
```

## Key Features

### OCCA API: Host, Device & Memory
The host, usually a CPU, is the physical device that runs the application. A device can be the host itself (i.e., a CPU) or an offload device, one that is physically distinct from the host. In this example, our kernels run on an offload device. OCCA enables the user to connect to the physical device through the OCCA API. For example, the following snippet shows two ways to set up the device:
```
occa::device device;

// One way: configure the device from a runtime option string
device.setup((std::string) args["options/device"]);

// Another way: configure the device from explicit properties
device.setup({
  {"mode"     , "CUDA"},
  {"device_id", 0},
});
```
The OCCA API is also used for memory allocation. This is done using the [malloc](https://libocca.org/#/api/device/malloc) method on a device object.
```
int N = 1000;
occa::memory o_uh, o_uh_prev;
o_uh = device.malloc<double>(N);
o_uh_prev = device.malloc<double>(N);
```
To get the backend pointer, one can do:
```
double *d_b = static_cast<double *>(o_uh.ptr());
```
This exposes the address set by the backend model (e.g. CUDA) and hardware (e.g. NVIDIA A100). Doing this will be helpful as we perform in-situ analysis on data resident on the device.

### Using CuPy to enable zero-copy, in-situ analysis
In CuPy, `cupy.ndarray` is the counterpart of NumPy's `numpy.ndarray`: it provides an interface to a fixed-size multi-dimensional array that resides on a CUDA device. Low-level CUDA support in CuPy allows us to wrap existing device memory without copying. For example,

```
import cupy

def my_function(a):
    # Wrap the device buffer described by a's array interface in a
    # cupy.ndarray without copying the data.
    mem = cupy.cuda.UnownedMemory(
        a.__array_interface__['data'][0],  # raw device pointer
        a.nbytes,                          # buffer size in bytes
        a,                                 # owner object keeping the memory alive
        0)                                 # device id
    b = cupy.ndarray(
        a.__array_interface__['shape'][0],
        cupy.dtype(a.dtype.name),
        cupy.cuda.MemoryPointer(mem, 0),
        strides=a.__array_interface__['strides'])
    return b
```
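
Once the solver's buffer is wrapped this way, the analysis can stay entirely on the GPU. Below is a minimal sketch of the SVD step reported in the run output above; the snapshot matrix, its shape, and the variable names are illustrative assumptions, not code from the repository's `python_module.py`.

```
import cupy

# Stand-in for a stack of zero-copy solution snapshots,
# shape (num_snapshots, num_points), resident on the device
snapshots = cupy.random.rand(50, 1000)

# Thin SVD computed on the GPU; rows of vt are the spatial modes here
u, s, vt = cupy.linalg.svd(snapshots, full_matrices=False)

# Only the few leading modes need to leave the device, e.g. for plotting
leading_modes = cupy.asnumpy(vt[:3])
print(leading_modes.shape)  # (3, 1000)
```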
55 changes: 55 additions & 0 deletions couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/build.sh
@@ -0,0 +1,55 @@
#!/bin/bash
set -x
#-----

PREFIX_PATHS=

# Default build parameters
: ${BUILD_DIR:=`pwd`/build}
: ${INSTALL_DIR:=`pwd`/install}
: ${BUILD_TYPE:="RelWithDebInfo"}

: ${CC:="gcc"}
: ${CXX:="g++"}
: ${FC:="gfortran"}

: ${MPICC:="mpicc"}
: ${MPICXX:="mpicxx"}
: ${MPIFC:="mpif77"}

: ${EXTERNAL_BLASLAPACK:="OFF"}
: ${EXTERNAL_OCCA:="ON"}

# OCCA Configuration
: ${ENABLE_DPCPP:="OFF"}
: ${ENABLE_OPENCL:="OFF"}
: ${ENABLE_CUDA:="ON"}
: ${ENABLE_HIP:="OFF"}
: ${ENABLE_OPENMP:="ON"}
: ${ENABLE_METAL:="OFF"}
: ${ENABLE_MPI:="ON"}

cmake -S . -B ${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} \
-DCMAKE_PREFIX_PATH=${PREFIX_PATHS} \
-DCMAKE_C_COMPILER=${CC} \
-DCMAKE_CXX_COMPILER=${CXX} \
-DCMAKE_Fortran_COMPILER=${FC} \
-DMPI_C_COMPILER=${MPICC} \
-DMPI_CXX_COMPILER=${MPICXX} \
-DMPI_Fortran_COMPILER=${MPIFC} \
-DEXTERNAL_BLASLAPACK=${EXTERNAL_BLASLAPACK} \
-DEXTERNAL_OCCA=${EXTERNAL_OCCA} \
-DENABLE_DPCPP=${ENABLE_DPCPP} \
-DENABLE_OPENCL=${ENABLE_OPENCL} \
-DENABLE_CUDA=${ENABLE_CUDA} \
-DENABLE_HIP=${ENABLE_HIP} \
-DENABLE_OPENMP=${ENABLE_OPENMP} \
-DENABLE_METAL=${ENABLE_METAL} \
-DENABLE_MPI=${ENABLE_MPI}

cmake --build ${BUILD_DIR} --parallel 4 && \
cmake --install ${BUILD_DIR} --prefix ${INSTALL_DIR}
19 changes: 19 additions & 0 deletions couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/kernel/burger.okl
@@ -0,0 +1,19 @@
@kernel void update_burger(const int Npts,
                           const double c1, // inertial coefficient
                           const double c2, // viscous coefficient
                           @restrict double *u_prev,
                           @restrict double *RHS) {

  for (int i = 0; i < Npts; ++i; @tile(4,@outer,@inner)) {

    if (i > 0 && i < Npts-1) {
      // Interior points: central differences for the viscous and inertial terms
      RHS[i] = u_prev[i] + c2*(u_prev[i+1] - 2.0*u_prev[i] + u_prev[i-1]) - u_prev[i]*c1*(u_prev[i+1] - u_prev[i-1]);
    } else if (i == 0) {
      // Periodic boundary: wrap around to the last interior point
      RHS[i] = u_prev[Npts-2];
    } else if (i >= Npts-1) {
      RHS[i] = u_prev[1];
    }

  }

}
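
For reference, here is the kernel's update rule written as a NumPy sketch of one explicit timestep on the host. The coefficients `c1` and `c2` play the same roles as in the OKL kernel above; the grid size and initial condition are illustrative.

```
import numpy as np

def update_burger(u_prev, c1, c2):
    # One explicit timestep of the discretized Burgers equation,
    # mirroring the OKL kernel: diffusion plus nonlinear advection,
    # with periodic wrap-around at the endpoints.
    rhs = np.empty_like(u_prev)
    # Interior points
    rhs[1:-1] = (u_prev[1:-1]
                 + c2 * (u_prev[2:] - 2.0 * u_prev[1:-1] + u_prev[:-2])
                 - u_prev[1:-1] * c1 * (u_prev[2:] - u_prev[:-2]))
    # Periodic endpoints, as in the kernel's boundary branches
    rhs[0] = u_prev[-2]
    rhs[-1] = u_prev[1]
    return rhs

# Illustrative use: a sine wave on 1000 points
u = np.sin(np.linspace(0.0, 2.0 * np.pi, 1000))
u_next = update_burger(u, c1=0.01, c2=0.005)
```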