Merge branch 'master' of github.com:argonne-lcf/sdl_workshop
rickybalin committed Oct 6, 2022
2 parents 04e341d + 808e661 commit d0c7296
Showing 114 changed files with 871 additions and 67 deletions.
25 changes: 7 additions & 18 deletions couplingSimulationML/ML_PythonC++_Embedding/README.md
@@ -1,4 +1,4 @@
-This example was written by Romit Maulik and edited by Bethany Lusch.
+This example was originally written by Romit Maulik, modified by Saumil Patel to use OCCA, and edited by Bethany Lusch.

# Description

@@ -12,30 +12,19 @@ In addition, this code also highlights the advantages of integrating the Python
3. A potential interface (if there are no issues with security) to streaming data from the internet (say, from a Python API).
4. Easy ability to save data using formats like HDF5 or NetCDF4.

-The test-case demonstrated here is representative of several Sci-ML workloads. We aim to build a surrogate model using TensorFlow in Python from data generated by a C++ computation. The methodology we utilize is something called the "POD-LSTM" - here snapshots of the solution field are linearly compressed using an SVD and the compressed representations are used as training data within a long short-term memory (LSTM) neural network. The LSTM is used to forecast compressed representations of the solution field in the future (for more details please visit our Editor's pick article [here](https://doi.org/10.1063/5.0019884)). However, this educational proxy-app may easily be modified to solve more complex problems (for example: closure modeling, data assimilation, and control) which assess the interplay of compute and ML.
+The test case demonstrated here captures a modal decomposition using an SVD and is representative of a Sci-ML workload. We aim to build this toward a surrogate model using TensorFlow in Python from data generated by a C++ computation. Further details on developing the surrogate model can be found [here](archive/ThetaGPU/Background.md).

-Here is how we connected C++ and Python:
+Here is how we connected C++ and Python, although we didn't include the neural network training in this demo:
![Coupling](CouplingDiagram.png)

-For running this mini-app on ThetaKNL/GPU, look at the scripts/README within `ML_PythonC++_Embedding/ThetaGPU/` and `ML_PythonC++_Embedding/ThetaKNL/` subdirectories.
+For running this mini-app on ThetaGPU, look at the scripts/README within the `ML_PythonC++_Embedding/ThetaGPU_OCCA/` subdirectory.

To see a similar coupling but with OpenFOAM on the C++ side (CFD software), check out our PythonFOAM work ([paper](https://arxiv.org/pdf/2103.09389.pdf), [code](https://github.com/argonne-lcf/PythonFOAM)).

-## What you should see when you run this on either ThetaGPU or ThetaKNL
+## What you should see when you run this on ThetaGPU

### Field evolution
-![Fields](ThetaKNL/app_build/Field_evolution.png "Fields")
+![Fields](archive/ThetaKNL/app_build/Field_evolution.png "Fields")

### Modal decomposition
-![Modes](ThetaKNL/app_build/SVD_Eigenvectors.png "Modes")
-
-### Forecasting the modal evolution in time (still rather poor but you get the idea)
-![Forecasting Mode 0](ThetaKNL/app_build/Mode_0_prediction.png "Mode 0 prediction")
-
-![Forecasting Mode 1](ThetaKNL/app_build/Mode_1_prediction.png "Mode 1 prediction")
-
-![Forecasting Mode 2](ThetaKNL/app_build/Mode_2_prediction.png "Mode 2 prediction")
-
-In order to get good results in [our paper](https://doi.org/10.1063/5.0019884), we use more training data. The code for that paper is [here](https://github.com/rmjcs2020/NATSurrogates).
+![Modes](archive/ThetaKNL/app_build/SVD_Eigenvectors.png "Modes")
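
To make the modal-decomposition step concrete, here is a minimal POD-via-SVD sketch in NumPy. This is an illustration only: the function name, array shapes, and random data are assumptions, not code from the repository's `python_module.py`.

```
import numpy as np

def pod_modes(snapshots, num_modes=3):
    # snapshots: (num_points, num_snapshots), one solution field per column
    # Subtract the temporal mean so the modes capture fluctuations
    fluctuations = snapshots - snapshots.mean(axis=1, keepdims=True)
    # Thin SVD: columns of u are the spatial (POD) modes,
    # s holds the singular values (modal energies)
    u, s, vt = np.linalg.svd(fluctuations, full_matrices=False)
    return u[:, :num_modes], s

# Illustrative use: 256 grid points, 100 snapshots of a 1D field
snapshots = np.random.rand(256, 100)
modes, sigma = pod_modes(snapshots)
print(modes.shape)  # (256, 3)
```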
84 changes: 84 additions & 0 deletions couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/CMakeLists.txt
@@ -0,0 +1,84 @@
cmake_minimum_required(VERSION 3.0)
project(burger VERSION 0.1)

enable_language(C)
enable_language(CXX)
enable_language(Fortran)
enable_testing()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

include(GNUInstallDirs)
include(FortranCInterface)

set(dfloat "double")
set(dfloatString "\"double\"")
set(dfloatFormat "\"%lf\"")
set(dlong "long int")
set(dlongString "\"long int\"")
set(dlongFormat "\"%ld\"")
set(hlong "long long int")
set(hlongString "\"long long int\"")
set(hlongFormat "\"%lld\"")
set(MPI_DFLOAT "MPI_DOUBLE")
set(MPI_DLONG "MPI_LONG_INT")
set(MPI_HLONG "MPI_LONG_LONG_INT")

#option(USE_MPI "Use the MPI library." ON)
#if(USE_MPI)
# find_package(MPI)
#endif()

#option(EXTERNAL_BLASLAPACK "Use an external/vendor-provided Blas/Lapack library" OFF)
#if(EXTERNAL_BLASLAPACK)
# find_package(BLAS)
# find_package(LAPACK)
#endif()

option(EXTERNAL_OCCA "Use an external OCCA build" ON)
if(EXTERNAL_OCCA)
find_library(OCCA_LIBRARY_PATH NAMES occa libocca PATHS $ENV{OCCA_DIR}/lib)
find_path(OCCA_INCLUDE_PATH NAMES occa.hpp occa.h PATHS $ENV{OCCA_DIR}/include)
add_library(libocca UNKNOWN IMPORTED)
set_target_properties(libocca PROPERTIES IMPORTED_LOCATION ${OCCA_LIBRARY_PATH})
target_include_directories(libocca INTERFACE ${OCCA_INCLUDE_PATH})
endif()

add_compile_definitions(
"USE_OCCA_MEM_BYTE_ALIGN=64"
"dfloat=double"
"dfloatString=\"double\""
"dfloatFormat=\"%lf\""
"MPI_DFLOAT=MPI_DOUBLE"
"dlong=int"
"dlongString=\"int\""
"dlongFormat=\"%d\""
"MPI_DLONG=MPI_INT"
"hlong=long long int"
"hlongString=\"long long int\""
"hlongFormat=\"%lld\""
"MPI_HLONG=MPI_LONG_LONG_INT"
)

add_executable(burger main.cpp)

include_directories(/lus/grand/projects/catalyst/world-shared/spatel/occa/src)

include_directories(/lus/theta-fs0/software/thetagpu/conda/2021-11-30/mconda3/lib/python3.8/site-packages/tensorflow/include/external/local_config_python/numpy_include)
include_directories(/lus/theta-fs0/software/thetagpu/conda/2021-11-30/mconda3/include)
include_directories(/lus/theta-fs0/software/thetagpu/conda/2021-11-30/mconda3/include/python3.8)
include_directories(/soft/hpc-sdk/Linux_x86_64/21.7/cuda/include)

find_library(PYTHONLIB python3.8 HINTS /lus/theta-fs0/software/thetagpu/conda/2021-11-30/mconda3/lib)

target_include_directories(burger PUBLIC ${CMAKE_SOURCE_DIR}/core)
target_link_libraries(burger
PUBLIC
libocca
${PYTHONLIB}
)

install(TARGETS burger RUNTIME DESTINATION ".")
install(DIRECTORY kernel/ DESTINATION kernel)
install(FILES python_module.py DESTINATION ".")
192 changes: 192 additions & 0 deletions couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/README.md
@@ -0,0 +1,192 @@
# Description

The goal of this implementation is to provide an example of how one can integrate a Python-based machine learning (ML) framework within a computational physics (PDE) solver. Like most GPU-enabled solvers, the physics kernel is executed on the device, where critical field data resides. This implementation makes use of the [CuPy](https://cupy.dev/) framework to perform in-situ analysis on the device, thereby avoiding the cost of data movement to the host. Furthermore, this example demonstrates how to couple the ML workflow with an application that uses a performance-portability abstraction layer, namely [OCCA](https://github.com/libocca/occa), which executes physics kernels on the device for a variety of backend-specific programming models (e.g., CUDA, HIP, SYCL).

## Requirements

- [OCCA](https://github.com/libocca/occa)
- cmake
- C++17 compiler
- C11 compiler
- CUDA 9 or later
- Virtual Python Environment

All of the above are provided on ThetaGPU.

## Building and Running

We assume that you have cloned the repo to a suitable location. These are the steps to execute this code on ThetaGPU (interactively):

1. From a Theta login node, log in to a ThetaGPU service node
```
ssh thetagpusn1
```
2. Request an interactive session on an A100 GPU
```
qsub -A SDL_Workshop \
-q training-gpu \
-I \
-n 1 \
--attrs filesystems=home,grand,eagle \
-t 60
```
3. Set the environment

Edit `OCCA_CACHE_DIR` in `set_OCCA_env.sh`. Then run `source set_OCCA_env.sh`. This loads modules and sets certain environment variables.

```
$ cat set_OCCA_env.sh
module load cmake
module load conda/2021-11-30
SDL=sdl_workshop/couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/
export OCCA_DIR=/lus/grand/projects/catalyst/world-shared/spatel/occa/install
export OCCA_CACHE_DIR=/path/to/$SDL
export OCCA_CXX="g++"
export OCCA_CXXFLAGS="-O3"
export OCCA_CUDA_COMPILER="nvcc"
export OCCA_CUDA_COMPILER_FLAGS="-O3 --fmad=true"
export PATH+=":${OCCA_DIR}/bin"
export LD_LIBRARY_PATH+=":${OCCA_DIR}/lib"
```
NOTE: the `OCCA_CACHE_DIR` specifies a location where OCCA-specific kernels (.okl) are stored/cached. Please make note of your path to this directory.

4. Check the environment.
```
$ module list
Currently Loaded Modules:
1) Core/StdEnv 2) cmake/3.19.5 3) openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0 4) conda/2021-11-30
```
```
$ occa info
========+======================+=================================
CPU(s) | Processor Name | AMD EPYC 7742 64-Core Processor
| Memory | 1007.69 GB
| Clock Frequency | 2.2 MHz
| SIMD Instruction Set | SSE2
| SIMD Width | 128 bits
| L1d Cache Size | 4 MB
| L1i Cache Size | 4 MB
| L2 Cache Size | 64 MB
| L3 Cache Size | 512 MB
========+======================+=================================
CUDA | Device Name | NVIDIA A100-SXM4-40GB
| Device ID | 0
| Memory | 39.58 GB
========+======================+=================================
```
5. Activate the virtual Python environment

```
$ conda activate
```
6. Build by running the `build.sh` script from this directory. This script is the cmake driver.

```
$ sh build.sh
```
You should see output that looks something like:
```
-- Build files have been written to: ...
+ cmake --build .../ML_PythonC++_Embedding/ThetaGPU_OCCA/build --parallel 4
Scanning dependencies of target burger
[ 50%] Building CXX object CMakeFiles/burger.dir/main.cpp.o
[100%] Linking CXX executable burger
[100%] Built target burger
+ cmake --install ... --prefix ...
-- Install configuration: "RelWithDebInfo"
-- Installing: .../ML_PythonC++_Embedding/ThetaGPU_OCCA/install/./burger
-- Set runtime path of ".../ThetaGPU_OCCA/install/./burger" to ""
-- Installing: .../ML_PythonC++_Embedding/ThetaGPU_OCCA/install/kernel
-- Installing: .../ML_PythonC++_Embedding/ThetaGPU_OCCA/install/kernel/burger.okl
-- Installing: .../ML_PythonC++_Embedding/ThetaGPU_OCCA/install/./python_module.py
```
7. Run

```
$ cd install/
$ ls
burger kernel python_module.py
$ ls kernel/
burger.okl
$ ./burger
Initialization of Python: Done
Within Python Module File
Loaded Python Module File: Done
Loaded Functions: Done
Called python data collection function successfully
time = 0.001
Called python data collection function successfully
Called python data collection function successfully
Called python data collection function successfully
...
...
...
Called python data collection function successfully
Called python data collection function successfully
time = 2.001
Mean Wall-Time: 0.0469978
A random value in the solution array: 0.0733873
Called python analyses function successfully
Performing SVD
Called python analyses function successfully
$ ls
burger Field_evolution.png kernel python_module.py SVD_Eigenvectors.png
```

## Key Features

### OCCA API: Host, Device & Memory
The host, usually a CPU, is the physical device that runs the application. A device can be the host itself (i.e., a CPU) or an offload device, one that is physically distinct from the host. In this example, our kernels run on an offload device. OCCA enables the user to connect to the physical device through the OCCA API. For example, the following snippet shows two ways to set up the device:
```
occa::device device;

// One way: configure the device from a runtime option string
device.setup((std::string) args["options/device"]);

// Another way: configure the device from explicit properties
device.setup({
  {"mode"     , "CUDA"},
  {"device_id", 0},
});
```
The OCCA API is also used for memory allocation. This is done using the [malloc](https://libocca.org/#/api/device/malloc) method on a device object.
```
int N = 1000;
occa::memory o_uh, o_uh_prev;
o_uh = device.malloc<double>(N);
o_uh_prev = device.malloc<double>(N);
```
To get the backend pointer, one can do:
```
double *d_b = static_cast<double *>(o_uh.ptr());
```
This exposes the address set by the backend model (e.g. CUDA) and hardware (e.g. NVIDIA A100). Doing this will be helpful as we perform in-situ analysis on data resident on the device.

### Using CuPy to enable zero-copy, in-situ analysis
In CuPy, `cupy.ndarray` is the counterpart of NumPy's `numpy.ndarray`: it provides an interface to a fixed-size multi-dimensional array that resides on a CUDA device. Low-level CUDA support in CuPy allows us to wrap existing device memory without copying. For example,

```
import cupy

def my_function(a):
    # Wrap the device buffer described by a's array interface in a
    # cupy.ndarray without copying the data.
    mem = cupy.cuda.UnownedMemory(
        a.__array_interface__['data'][0],  # raw device pointer
        a.nbytes,                          # buffer size in bytes
        a,                                 # owner object keeping the memory alive
        0)                                 # device id
    b = cupy.ndarray(
        a.__array_interface__['shape'][0],
        cupy.dtype(a.dtype.name),
        cupy.cuda.MemoryPointer(mem, 0),
        strides=a.__array_interface__['strides'])
    return b
```
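
Once the solver's buffer is wrapped this way, the analysis can stay entirely on the GPU. Below is a minimal sketch of the SVD step reported in the run output above; the snapshot matrix, its shape, and the variable names are illustrative assumptions, not code from the repository's `python_module.py`.

```
import cupy

# Stand-in for a stack of zero-copy solution snapshots,
# shape (num_snapshots, num_points), resident on the device
snapshots = cupy.random.rand(50, 1000)

# Thin SVD computed on the GPU; rows of vt are the spatial modes here
u, s, vt = cupy.linalg.svd(snapshots, full_matrices=False)

# Only the few leading modes need to leave the device, e.g. for plotting
leading_modes = cupy.asnumpy(vt[:3])
print(leading_modes.shape)  # (3, 1000)
```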
55 changes: 55 additions & 0 deletions couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/build.sh
@@ -0,0 +1,55 @@
#!/bin/bash
set -x
#-----

PREFIX_PATHS=

# Default build parameters
: ${BUILD_DIR:=`pwd`/build}
: ${INSTALL_DIR:=`pwd`/install}
: ${BUILD_TYPE:="RelWithDebInfo"}

: ${CC:="gcc"}
: ${CXX:="g++"}
: ${FC:="gfortran"}

: ${MPICC:="mpicc"}
: ${MPICXX:="mpicxx"}
: ${MPIFC:="mpif77"}

: ${EXTERNAL_BLASLAPACK:="OFF"}
: ${EXTERNAL_OCCA:="ON"}

# OCCA Configuration
: ${ENABLE_DPCPP:="OFF"}
: ${ENABLE_OPENCL:="OFF"}
: ${ENABLE_CUDA:="ON"}
: ${ENABLE_HIP:="OFF"}
: ${ENABLE_OPENMP:="ON"}
: ${ENABLE_METAL:="OFF"}
: ${ENABLE_MPI:="ON"}

cmake -S . -B ${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} \
-DCMAKE_PREFIX_PATH=${PREFIX_PATHS} \
-DCMAKE_C_COMPILER=${CC} \
-DCMAKE_CXX_COMPILER=${CXX} \
-DCMAKE_Fortran_COMPILER=${FC} \
-DMPI_C_COMPILER=${MPICC} \
-DMPI_CXX_COMPILER=${MPICXX} \
-DMPI_Fortran_COMPILER=${MPIFC} \
-DEXTERNAL_BLASLAPACK=${EXTERNAL_BLASLAPACK} \
-DEXTERNAL_OCCA=${EXTERNAL_OCCA} \
-DENABLE_DPCPP=${ENABLE_DPCPP} \
-DENABLE_OPENCL=${ENABLE_OPENCL} \
-DENABLE_CUDA=${ENABLE_CUDA} \
-DENABLE_HIP=${ENABLE_HIP} \
-DENABLE_OPENMP=${ENABLE_OPENMP} \
-DENABLE_METAL=${ENABLE_METAL} \
-DENABLE_MPI=${ENABLE_MPI}

cmake --build ${BUILD_DIR} --parallel 4 && \
cmake --install ${BUILD_DIR} --prefix ${INSTALL_DIR}
19 changes: 19 additions & 0 deletions couplingSimulationML/ML_PythonC++_Embedding/ThetaGPU_OCCA/kernel/burger.okl
@@ -0,0 +1,19 @@
@kernel void update_burger(const int Npts,
                           const double c1, // inertial coefficient
                           const double c2, // viscous coefficient
                           @restrict double *u_prev,
                           @restrict double *RHS) {

  for (int i = 0; i < Npts; ++i; @tile(4,@outer,@inner)) {

    if (i > 0 && i < Npts-1) {
      // Interior points: central differences for the viscous and inertial terms
      RHS[i] = u_prev[i] + c2*(u_prev[i+1] - 2.0*u_prev[i] + u_prev[i-1]) - u_prev[i]*c1*(u_prev[i+1] - u_prev[i-1]);
    } else if (i == 0) {
      // Periodic boundary: wrap around to the last interior point
      RHS[i] = u_prev[Npts-2];
    } else if (i >= Npts-1) {
      RHS[i] = u_prev[1];
    }

  }

}
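
For reference, here is the kernel's update rule written as a NumPy sketch of one explicit timestep on the host. The coefficients `c1` and `c2` play the same roles as in the OKL kernel above; the grid size and initial condition are illustrative.

```
import numpy as np

def update_burger(u_prev, c1, c2):
    # One explicit timestep of the discretized Burgers equation,
    # mirroring the OKL kernel: diffusion plus nonlinear advection,
    # with periodic wrap-around at the endpoints.
    rhs = np.empty_like(u_prev)
    # Interior points
    rhs[1:-1] = (u_prev[1:-1]
                 + c2 * (u_prev[2:] - 2.0 * u_prev[1:-1] + u_prev[:-2])
                 - u_prev[1:-1] * c1 * (u_prev[2:] - u_prev[:-2]))
    # Periodic endpoints, as in the kernel's boundary branches
    rhs[0] = u_prev[-2]
    rhs[-1] = u_prev[1]
    return rhs

# Illustrative use: a sine wave on 1000 points
u = np.sin(np.linspace(0.0, 2.0 * np.pi, 1000))
u_next = update_burger(u, c1=0.01, c2=0.005)
```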