From 3e0790c08e8de660f8d54819af0a488c487f248a Mon Sep 17 00:00:00 2001 From: Reguly Reguly Date: Sun, 4 Dec 2022 14:57:40 +0200 Subject: [PATCH 1/5] HIP on latest develop, working with Cray --- makefiles/c_app.mk | 20 +- makefiles/common.mk | 16 +- makefiles/compilers.mk | 12 + makefiles/compilers/c/cray.mk | 2 +- makefiles/compilers/c_hip/hip.mk | 9 + makefiles/dependencies/hip.mk | 33 ++ makefiles/dependencies/tests/hip.cpp | 7 + op2/Makefile | 39 ++ op2/include/op_hip_reduction.h | 259 +++++++++ op2/include/op_hip_rt_support.h | 137 +++++ op2/src/hip/op_hip_decl.cpp | 393 +++++++++++++ op2/src/hip/op_hip_rt_support.cpp | 555 +++++++++++++++++++ op2/src/mpi/op_mpi_hip_decl.cpp | 749 +++++++++++++++++++++++++ op2/src/mpi/op_mpi_hip_kernels.cpp | 453 +++++++++++++++ op2/src/mpi/op_mpi_hip_rt_support.cpp | 763 ++++++++++++++++++++++++++ translator/c/op2.py | 1 + translator/c/op2_gen_cuda_simple.py | 35 +- 17 files changed, 3464 insertions(+), 19 deletions(-) create mode 100644 makefiles/compilers/c_hip/hip.mk create mode 100644 makefiles/dependencies/hip.mk create mode 100644 makefiles/dependencies/tests/hip.cpp create mode 100644 op2/include/op_hip_reduction.h create mode 100644 op2/include/op_hip_rt_support.h create mode 100644 op2/src/hip/op_hip_decl.cpp create mode 100644 op2/src/hip/op_hip_rt_support.cpp create mode 100644 op2/src/mpi/op_mpi_hip_decl.cpp create mode 100644 op2/src/mpi/op_mpi_hip_kernels.cpp create mode 100644 op2/src/mpi/op_mpi_hip_rt_support.cpp diff --git a/makefiles/c_app.mk b/makefiles/c_app.mk index 8de173d26..e392cc5b8 100644 --- a/makefiles/c_app.mk +++ b/makefiles/c_app.mk @@ -9,7 +9,7 @@ APP_ENTRY_MPI_BASENAME := $(basename $(APP_ENTRY_MPI)) APP_ENTRY_OP := $(APP_ENTRY_BASENAME)_op.cpp APP_ENTRY_MPI_OP := $(APP_ENTRY_MPI_BASENAME)_op.cpp -ALL_VARIANTS := seq genseq vec openmp openmp4 cuda cuda_hyb +ALL_VARIANTS := seq genseq vec openmp openmp4 cuda cuda_hyb hip ALL_VARIANTS += $(foreach variant,$(ALL_VARIANTS),mpi_$(variant)) ALL_VARIANTS := $(foreach variant,$(ALL_VARIANTS),$(APP_NAME)_$(variant)) @@ -28,6 +28,10 @@ ifeq ($(HAVE_C),true) ifeq ($(HAVE_CUDA),true) BASE_BUILDABLE_VARIANTS += cuda cuda_hyb endif + + ifeq ($(HAVE_HIP),true) + BASE_BUILDABLE_VARIANTS += hip + endif endif BUILDABLE_VARIANTS := @@ -69,7 +73,7 @@ all: $(BUILDABLE_VARIANTS) clean: -$(RM) $(ALL_VARIANTS) - -$(RM) -r seq vec openmp openmp4 cuda openacc + -$(RM) -r seq vec openmp openmp4 cuda hip openacc -$(RM) *_op.cpp -$(RM) .generated .generated -$(RM) *.d @@ -109,6 +113,8 @@ CUDA_HYB_SRC := $(APP_ENTRY_OP) \ MPI_CUDA_HYB_SRC := $(APP_ENTRY_MPI_OP) \ cuda/$(APP_NAME)_mpi_hybkernels_cpu.o cuda/$(APP_NAME)_mpi_hybkernels_gpu.o +HIP_SRC := $(APP_ENTRY_OP) hip/$(APP_NAME)_kernels.o +MPI_HIP_SRC := $(APP_ENTRY_MPI_OP) hip/$(APP_NAME)_mpi_kernels.o # $(1) = variant name # $(2) = additional flags @@ -134,10 +140,14 @@ $(eval $(call RULE_template, openmp, $(OMP_CPPFLAGS), $(eval $(call RULE_template, openmp4, $(OMP_OFFLOAD_CPPFLAGS) -DOP2_WITH_OMP4, OPENMP4, )) $(eval $(call RULE_template, cuda,, CUDA, MPI_CUDA)) $(eval $(call RULE_template, cuda_hyb, $(OMP_CPPFLAGS), CUDA, MPI_CUDA)) +$(eval $(call RULE_template, hip,, HIP, MPI_HIP)) $(APP_NAME)_cuda: cuda/$(APP_NAME)_kernels.o $(APP_NAME)_mpi_cuda: cuda/$(APP_NAME)_mpi_kernels.o +$(APP_NAME)_hip: hip/$(APP_NAME)_kernels.o +$(APP_NAME)_mpi_hip: hip/$(APP_NAME)_mpi_kernels.o + $(APP_NAME)_cuda_hyb: cuda/$(APP_NAME)_hybkernels_gpu.o cuda/$(APP_NAME)_hybkernels_cpu.o $(APP_NAME)_mpi_cuda_hyb: cuda/$(APP_NAME)_mpi_hybkernels_gpu.o 
cuda/$(APP_NAME)_mpi_hybkernels_cpu.o

@@ -147,6 +157,12 @@ cuda/$(APP_NAME)_kernels.o: .generated
 cuda/$(APP_NAME)_mpi_kernels.o: .generated
 	$(NVCC) $(NVCCFLAGS) $(OP2_INC) -c cuda/$(APP_ENTRY_MPI_BASENAME)_kernels.cu -o $@
 
+hip/$(APP_NAME)_kernels.o: .generated
+	$(HIPCXX) $(HIPFLAGS) $(OP2_INC) -c hip/$(APP_ENTRY_BASENAME)_kernels.cpp -o $@
+
+hip/$(APP_NAME)_mpi_kernels.o: .generated
+	$(HIPCXX) $(HIPFLAGS) $(OP2_INC) -c hip/$(APP_ENTRY_MPI_BASENAME)_kernels.cpp -o $@
+
 cuda/$(APP_NAME)_hybkernels_gpu.o: .generated
 	$(NVCC) $(NVCCFLAGS) -DOP_HYBRID_GPU -DGPUPASS $(OP2_INC) \
 		-c cuda/$(APP_ENTRY_BASENAME)_hybkernels.cu -o $@
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 4de2a2f01..9e9a3d4fb 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -78,6 +78,13 @@ ifeq ($(MAKECMDGOALS),config)
     $(call info_bold,> C/C++ CUDA compiler $(TEXT_NOTFOUND); skipping search for CUDA libraries)
   endif
 
+  ifeq ($(CONFIG_HAVE_C_HIP),true)
+    $(call info_bold,> C/C++ HIP compiler $(TEXT_FOUND) ($(CONFIG_HIP)); looking for the HIP libraries)
+    include $(DEPS_DIR)/hip.mk
+  else
+    $(call info_bold,> C/C++ HIP compiler $(TEXT_NOTFOUND); skipping search for HIP libraries)
+  endif
+
   $(info )
 
   ifeq ($(CONFIG_HAVE_MPI_C),true)
@@ -139,6 +146,7 @@ ifneq ($(MAKECMDGOALS),clean)
   $(info .   C:       $(if $(HAVE_C),$(CC),not found))
   $(info .   C++:     $(if $(HAVE_C),$(CXX),not found))
   $(info .   CUDA:    $(if $(HAVE_C_CUDA),$(NVCC),not found))
+  $(info .   HIP:     $(if $(HAVE_C_HIP),$(HIPCXX),not found))
   $(info .   Fortran: $(if $(HAVE_F),$(FC),not found))
   $(info )
   $(info MPI compilers:)
@@ -161,14 +169,15 @@ ifneq ($(MAKECMDGOALS),clean)
   $(info .   C:       $(CFLAGS))
   $(info .   C++:     $(CXXFLAGS))
   $(info .   CUDA:    $(NVCCFLAGS))
+  $(info .   HIP:     $(HIPFLAGS))
   $(info .   Fortran: $(FFLAGS))
   $(info )
 endif
 
-OP2_LIBS_SINGLE_NODE := seq cuda openmp openmp4
+OP2_LIBS_SINGLE_NODE := seq cuda hip openmp openmp4
 OP2_FOR_LIBS_SINGLE_NODE := $(foreach lib,$(OP2_LIBS_SINGLE_NODE),f_$(lib))
 
-OP2_LIBS_MPI := mpi mpi_cuda
+OP2_LIBS_MPI := mpi mpi_cuda mpi_hip
 OP2_FOR_LIBS_MPI := $(foreach lib,$(OP2_LIBS_MPI),f_$(lib))
 
 OP2_LIBS := hdf5 $(OP2_LIBS_SINGLE_NODE) $(OP2_LIBS_MPI)
@@ -199,3 +208,6 @@ $(foreach lib,$(OP2_LIBS_MPI),$(eval $(call OP2_LIB_template,$(lib),\
 
 OP2_LIB_CUDA += $(CUDA_LIB)
 OP2_LIB_MPI_CUDA += $(CUDA_LIB)
+
+OP2_LIB_HIP += $(HIP_LIB)
+OP2_LIB_MPI_HIP += $(HIP_LIB)
\ No newline at end of file
diff --git a/makefiles/compilers.mk b/makefiles/compilers.mk
index 74129b4df..8dc4c672d 100644
--- a/makefiles/compilers.mk
+++ b/makefiles/compilers.mk
@@ -2,6 +2,7 @@ ifdef OP2_COMPILER
   OP2_C_COMPILER ?= $(OP2_COMPILER)
   OP2_F_COMPILER ?= $(OP2_COMPILER)
   OP2_C_CUDA_COMPILER ?= nvhpc
+  OP2_C_HIP_COMPILER ?= hip
 endif
 
 # Process CUDA_GEN and NV_ARCH until CUDA_GEN is a whitespace separated list of
@@ -28,6 +29,10 @@ ifdef OP2_C_CUDA_COMPILER
   include $(MAKEFILES_DIR)/compilers/c_cuda/$(OP2_C_CUDA_COMPILER).mk
 endif
 
+ifdef OP2_C_HIP_COMPILER
+  include $(MAKEFILES_DIR)/compilers/c_hip/$(OP2_C_HIP_COMPILER).mk
+endif
+
 ifdef OP2_F_COMPILER
   include $(MAKEFILES_DIR)/compilers/fortran/$(OP2_F_COMPILER).mk
 endif
@@ -48,6 +53,12 @@ ifneq ($(shell which $(CONFIG_NVCC) 2> /dev/null),)
   CONFIG_HAVE_C_CUDA := true
 endif
 
+ifneq ($(shell which $(CONFIG_HIP) 2> /dev/null),)
+  CONFIG_HIP != which $(CONFIG_HIP)
+  CONFIG_HIPCXX = $(CONFIG_HIP)
+  CONFIG_HAVE_C_HIP := true
+endif
+
 ifneq ($(shell which $(CONFIG_FC) 2> /dev/null),)
   CONFIG_FC != which $(CONFIG_FC)
   CONFIG_HAVE_F := true
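With the detection above in place, selecting the HIP toolchain mirrors the existing CUDA flow. A minimal configuration sketch (compiler and architecture values are illustrative; HIP_ARCH must be set, since the hip.mk below passes --offload-arch=$(HIP_ARCH)):

    export OP2_COMPILER=cray        # host C/C++/Fortran compilers
    export OP2_C_HIP_COMPILER=hip   # selects makefiles/compilers/c_hip/hip.mk, i.e. hipcc
    export HIP_ARCH=gfx90a          # GPU target consumed by CONFIG_HIPFLAGS
    make config                     # should report the HIP compiler and libraries as found
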
diff --git a/makefiles/compilers/c/cray.mk b/makefiles/compilers/c/cray.mk
index 8235376c3..02671eeea 100644
--- a/makefiles/compilers/c/cray.mk
+++ b/makefiles/compilers/c/cray.mk
@@ -23,4 +23,4 @@ CONFIG_OMP_CPPFLAGS ?= -fopenmp
 CONFIG_CPP_HAS_OMP ?= true
 
 # CONFIG_OMP_OFFLOAD_CPPFLAGS ?=
-CONFIG_CPP_HAS_OMP_OFFLOAD ?= false
+CONFIG_CPP_HAS_OMP_OFFLOAD ?= false
\ No newline at end of file
diff --git a/makefiles/compilers/c_hip/hip.mk b/makefiles/compilers/c_hip/hip.mk
new file mode 100644
index 000000000..d8bdf3041
--- /dev/null
+++ b/makefiles/compilers/c_hip/hip.mk
@@ -0,0 +1,9 @@
+CONFIG_HIP ?= hipcc
+
+ifndef DEBUG
+  HIP_OPT := -Ofast
+else
+  HIP_OPT := -g -O0
+endif
+
+CONFIG_HIPFLAGS ?= -x hip --offload-arch=$(HIP_ARCH) $(HIP_OPT)
diff --git a/makefiles/dependencies/hip.mk b/makefiles/dependencies/hip.mk
new file mode 100644
index 000000000..8fc47235a
--- /dev/null
+++ b/makefiles/dependencies/hip.mk
@@ -0,0 +1,33 @@
+ifdef HIP_INSTALL_PATH
+  HIP_INC_PATH := -I$(HIP_INSTALL_PATH)/include
+  HIP_LIB_PATH := -L$(HIP_INSTALL_PATH)/lib64 -L$(HIP_INSTALL_PATH)/lib
+endif
+
+HIP_TEST = $(CONFIG_HIP) $(HIP_INC_PATH) \
+  $(DEPS_DIR)/tests/hip.cpp $(HIP_LIB_PATH) $(HIP_LINK) \
+  -o $(DEPS_DIR)/tests/hip
+
+$(file > $(DEP_BUILD_LOG),$(HIP_TEST))
+$(shell $(HIP_TEST) >> $(DEP_BUILD_LOG) 2>&1)
+
+ifneq ($(.SHELLSTATUS),0)
+  HIP_LINK ?= #-lculibos -lpthread -lrt -ldl
+
+  $(file >> $(DEP_BUILD_LOG),$(HIP_TEST))
+  $(shell $(HIP_TEST) >> $(DEP_BUILD_LOG) 2>&1)
+endif
+
+ifeq ($(.SHELLSTATUS),0)
+  $(shell rm -f $(DEPS_DIR)/tests/hip)
+
+  $(call info_bold,  > HIP libraries $(TEXT_FOUND) (link flags: $(or $(HIP_LINK), none)))
+
+  CONFIG_HAVE_HIP := true
+
+  CONFIG_HIP_INC := $(strip $(HIP_INC_PATH) $(HIP_DEF))
+  CONFIG_HIP_LIB := $(strip $(HIP_LIB_PATH) $(HIP_LINK))
+else
+  $(call info_bold,  > HIP libraries $(TEXT_NOTFOUND):)
+  $(info $(file < $(DEP_BUILD_LOG)))
+  $(info )
+endif
diff --git a/makefiles/dependencies/tests/hip.cpp b/makefiles/dependencies/tests/hip.cpp
new file mode 100644
index 000000000..daff0a814
--- /dev/null
+++ b/makefiles/dependencies/tests/hip.cpp
@@ -0,0 +1,7 @@
+#include <hip/hip_runtime.h>
+
+int main() {
+  int deviceCount = 0;
+  auto err = hipGetDeviceCount(&deviceCount);
+  return err;
+}
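For reference, the pattern rules added to c_app.mk earlier reduce to a plain hipcc invocation over the translator output; for an app such as airfoil the generated kernels would be compiled roughly as follows (paths and architecture illustrative):

    hipcc -x hip --offload-arch=gfx90a -Ofast -I$OP2_INSTALL_PATH/c/include \
      -c hip/airfoil_kernels.cpp -o hip/airfoil_kernels.o
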
diff --git a/op2/Makefile b/op2/Makefile
index 37f418acc..c7ff8ac3b 100644
--- a/op2/Makefile
+++ b/op2/Makefile
@@ -32,6 +32,14 @@ ifeq ($(HAVE_C),true)
     endif
   endif
 
+  ifeq ($(HAVE_HIP),true)
+    OP2_BUILDABLE_LIBS += hip
+
+    ifeq ($(HIP_IS_BUILDABLE),true)
+      OP2_BUILDABLE_LIBS += mpi_hip
+    endif
+  endif
+
   ifeq ($(MPI_IS_BUILDABLE),true)
     OP2_BUILDABLE_LIBS += mpi
   endif
@@ -110,6 +118,10 @@ OP2_CUDA := $(OP2_BASE) $(addprefix $(OBJ)/,\
   cuda/op_cuda_decl.o \
   cuda/op_cuda_rt_support.o)
 
+OP2_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\
+  hip/op_hip_decl.o \
+  hip/op_hip_rt_support.o)
+
 OP2_FOR_CUDA := $(OP2_CUDA) $(OP2_FOR_BASE_CUDA) $(addprefix $(OBJ)/fortran/,\
   op2_for_rt_wrappers_cuda.o \
   cudaConfigurationParams.o)
@@ -153,6 +165,18 @@ OP2_MPI_CUDA := $(OP2_BASE) $(addprefix $(OBJ)/,\
   externlib/op_util.o \
   externlib/op_renumber.o)
 
+OP2_MPI_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\
+  hip/op_hip_rt_support+mpi.o \
+  mpi/op_mpi_core.o \
+  mpi/op_mpi_part_core.o \
+  mpi/op_mpi_hip_decl.o \
+  mpi/op_mpi_hip_rt_support.o \
+  mpi/op_mpi_hip_kernels.o \
+  mpi/op_mpi_hdf5.o \
+  mpi/op_mpi_util.o \
+  externlib/op_util.o \
+  externlib/op_renumber.o)
+
 OP2_FOR_MPI_CUDA := $(OP2_MPI_CUDA) $(OP2_FOR_BASE_MPI_CUDA) $(addprefix $(OBJ)/fortran/,\
   op2_for_rt_wrappers_cuda.o \
   cudaConfigurationParams.o)
@@ -186,6 +210,12 @@ $(OBJ)/cuda/%+mpi.o: src/cuda/%.cpp | $(OBJ)
 $(OBJ)/cuda/%.o: src/cuda/%.cpp | $(OBJ)
 	$(CXX) $(CXXFLAGS) $(INC) -DSET_CUDA_CACHE_CONFIG -c $< -o $@
 
+$(OBJ)/hip/%+mpi.o: src/hip/%.cpp | $(OBJ)
+	$(MPICXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -DOPMPI -c $< -o $@
+
+$(OBJ)/hip/%.o: src/hip/%.cpp | $(OBJ)
+	$(CXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -c $< -o $@
+
 $(OBJ)/openmp4/%.o: src/openmp4/%.cpp | $(OBJ)
 	$(CXX) $(CXXFLAGS) $(OMP_OFFLOAD_CXXFLAGS) $(INC) -c $< -o $@
 
@@ -195,6 +225,15 @@ $(OBJ)/mpi/%.o: src/mpi/%.cpp | $(OBJ)
 $(OBJ)/mpi/%.o: src/mpi/%.cu | $(OBJ)
 	$(NVCC) $(NVCCFLAGS) $(INC) -c $< -o $@
 
+$(OBJ)/mpi/op_mpi_hip_decl.o: src/mpi/op_mpi_hip_decl.cpp | $(OBJ)
+	$(MPICXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -c $< -o $@
+
+$(OBJ)/mpi/op_mpi_hip_rt_support.o: src/mpi/op_mpi_hip_rt_support.cpp | $(OBJ)
+	$(MPICXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -c $< -o $@
+
+$(OBJ)/mpi/op_mpi_hip_kernels.o: src/mpi/op_mpi_hip_kernels.cpp | $(OBJ)
+	$(MPICXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -c $< -o $@
+
 $(OBJ)/fortran/%+mpi.o: src/fortran/%.c | $(OBJ)
 	$(MPICC) $(CFLAGS) $(INC) -DOPMPI -c $< -o $@
 
diff --git a/op2/include/op_hip_reduction.h b/op2/include/op_hip_reduction.h
new file mode 100644
index 000000000..bc21a66e7
--- /dev/null
+++ b/op2/include/op_hip_reduction.h
@@ -0,0 +1,259 @@
+#include "hip/hip_runtime.h"
+/*
+ * Open source copyright declaration based on BSD open source template:
+ * http://www.opensource.org/licenses/bsd-license.php
+ *
+ * This file is part of the OP2 distribution.
+ *
+ * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in
+ * the main source directory for a full list of copyright holders.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * The name of Mike Giles may not be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __OP_HIP_REDUCTION_H
+#define __OP_HIP_REDUCTION_H
+
+/*
+ * This file provides an optimised implementation for reduction of OP2 global
+ * variables.
+ * It is separated from the op_hip_rt_support.h file because the reduction code
+ * is based on C++ templates, while the other file only includes C routines.
+ */ + +#include + +/* + * reduction routine for arbitrary datatypes + */ + +template +__inline__ __device__ void op_reduction(volatile T *dat_g, T dat_l) { + extern __shared__ volatile double temp2[]; + __shared__ volatile T *temp; + temp = (T *)temp2; + T dat_t; + + __syncthreads(); /* important to finish all previous activity */ + + int tid = threadIdx.x; + temp[tid] = dat_l; + + // first, cope with blockDim.x perhaps not being a power of 2 + + __syncthreads(); + + int d = 1 << (31 - __clz(((int)blockDim.x - 1))); + // d = blockDim.x/2 rounded up to nearest power of 2 + + if (tid + d < blockDim.x) { + dat_t = temp[tid + d]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + + temp[tid] = dat_l; + } + + // second, do reductions involving more than one warp + + for (d >>= 1; d > warpSize; d >>= 1) { + __syncthreads(); + + if (tid < d) { + dat_t = temp[tid + d]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + + temp[tid] = dat_l; + } + } + + // third, do reductions involving just one warp + + __syncthreads(); + + if (tid < warpSize) { + for (; d > 0; d >>= 1) { + // __syncwarp(); + if (tid < d) { + dat_t = temp[tid + d]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + + temp[tid] = dat_l; + } + } + + // finally, update global reduction variable + + if (tid == 0) { + switch (reduction) { + case OP_INC: + *dat_g = *dat_g + dat_l; + break; + case OP_MIN: + if (dat_l < *dat_g) + *dat_g = dat_l; + break; + case OP_MAX: + if (dat_l > *dat_g) + *dat_g = dat_l; + break; + } + } + } +} + +/* + * reduction routine for arbitrary datatypes + * (alternative version using just one warp) + * + */ + +template +__inline__ __device__ void op_reduction_alt(volatile T *dat_g, T dat_l) { + extern __shared__ volatile double temp2[]; + __shared__ volatile T *temp; + temp = (T *)temp2; + T dat_t; + + __syncthreads(); /* important to finish all previous activity */ + + int tid = threadIdx.x; + temp[tid] = dat_l; + + __syncthreads(); + + // set number of active threads + + int d = warpSize; + + if (blockDim.x < warpSize) + d = 1 << (31 - __clz((int)blockDim.x)); + // this gives blockDim.x rounded down to nearest power of 2 + + if (tid < d) { + + // first, do reductions for each thread + + for (int t = tid + d; t < blockDim.x; t += d) { + dat_t = temp[t]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + } + + temp[tid] = dat_l; + + // second, do reductions to combine thread reductions + + for (d >>= 1; d > 0; d >>= 1) { + if (tid < d) { + dat_t = temp[tid + d]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + + temp[tid] = dat_l; + } + } + + // finally, update global reduction variable + + if (tid == 0) { + switch (reduction) { + case OP_INC: + *dat_g = *dat_g + dat_l; + break; + case OP_MIN: + if (dat_l < *dat_g) + *dat_g = dat_l; + break; + case OP_MAX: 
+        if (dat_l > *dat_g)
+          *dat_g = dat_l;
+        break;
+      }
+    }
+  }
+}
+
+#endif /* __OP_HIP_REDUCTION_H */
diff --git a/op2/include/op_hip_rt_support.h b/op2/include/op_hip_rt_support.h
new file mode 100644
index 000000000..78eb42908
--- /dev/null
+++ b/op2/include/op_hip_rt_support.h
@@ -0,0 +1,137 @@
+/*
+ * Open source copyright declaration based on BSD open source template:
+ * http://www.opensource.org/licenses/bsd-license.php
+ *
+ * This file is part of the OP2 distribution.
+ *
+ * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in
+ * the main source directory for a full list of copyright holders.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * The name of Mike Giles may not be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __OP_HIP_RT_SUPPORT_H
+#define __OP_HIP_RT_SUPPORT_H
+
+/*
+ * This header file declares the HIP back-end specific run-time functions
+ * to be used by the code generated by OP2 compiler.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+/* define HIP warpsize for OP2 */
+
+#define OP_WARPSIZE 32
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Global variables actually defined in the corresponding c file
+ */
+extern char *OP_consts_h, *OP_consts_d, *OP_reduct_h, *OP_reduct_d;
+
+extern void __syncthreads();
+
+extern hipStream_t op2_grp_secondary;
+
+/*
+ * personal stripped-down version of cutil_inline.h
+ */
+
+#define cutilSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
+#define cutilCheckMsg(msg) __cutilCheckMsg(msg, __FILE__, __LINE__)
+
+void __cudaSafeCall(hipError_t err, const char *file, const int line);
+
+void __cutilCheckMsg(const char *errorMessage, const char *file,
+                     const int line);
+
+void cutilDeviceInit(int argc, char **argv);
+
+void cutilDeviceInit_mpi(int argc, char **argv, int mpi_rank);
+
+/*
+ * routines to move arrays to/from GPU device
+ */
+
+void op_mvHostToDevice(void **map, int size);
+
+void op_cpHostToDevice(void **data_d, void **data_h, int size);
+
+void op_cuda_get_data(op_dat dat);
+
+/*
+ * Plan interface for generated code: the implementation changes depending
+ * on the actual back-end libraries (e.g. hip or openmp) and it is
+ * hence declared here for hip.
+ * To avoid linking various header files together, the design requires the + * declaration of this function as back-end specific, while the common + * op_rt_support.h header file only declares the low-level functions + * (e.g. op_plan_core) + */ +op_plan *op_plan_get(char const *name, op_set set, int part_size, int nargs, + op_arg *args, int ninds, int *inds); + +op_plan *op_plan_get_stage(char const *name, op_set set, int part_size, + int nargs, op_arg *args, int ninds, int *inds, + int staging); + +op_plan *op_plan_get_stage_upload(char const *name, op_set set, int part_size, + int nargs, op_arg *args, int ninds, int *inds, + int staging, int upload); + +void op_cuda_exit(); + +/* + * routines to resize constant/reduct arrays, if necessary + */ + +void reallocConstArrays(int consts_bytes); + +void reallocReductArrays(int reduct_bytes); + +/* + * routines to move constant/reduct arrays + */ + +void mvConstArraysToDevice(int consts_bytes); + +void mvConstArraysToHost(int consts_bytes); + +void mvReductArraysToDevice(int reduct_bytes); + +void mvReductArraysToHost(int reduct_bytes); + +#ifdef __cplusplus +} +#endif + +#endif /* __OP_HIP_RT_SUPPORT_H */ diff --git a/op2/src/hip/op_hip_decl.cpp b/op2/src/hip/op_hip_decl.cpp new file mode 100644 index 000000000..e3b3dc586 --- /dev/null +++ b/op2/src/hip/op_hip_decl.cpp @@ -0,0 +1,393 @@ +/* + * Open source copyright declaration based on BSD open source template: + * http://www.opensource.org/licenses/bsd-license.php + * + * This file is part of the OP2 distribution. + * + * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in + * the main source directory for a full list of copyright holders. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * The name of Mike Giles may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+//
+// This file implements the OP2 user-level functions for the HIP backend
+//
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <op_lib_c.h>
+#include <op_lib_core.h>
+#include <op_hip_rt_support.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// HIP-specific OP2 functions
+//
+
+void op_init_soa(int argc, char **argv, int diags, int soa) {
+  OP_auto_soa = soa;
+  op_init(argc, argv, diags);
+}
+
+void op_init(int argc, char **argv, int diags) {
+  op_init_core(argc, argv, diags);
+
+  cutilDeviceInit(argc, argv);
+
+//
+// The following call is only made in the C version of OP2,
+// as it causes memory trashing when called from Fortran.
+// \warning add -DSET_CUDA_CACHE_CONFIG to compiling line
+// for this file when implementing C OP2.
+//
+
+#ifdef SET_CUDA_CACHE_CONFIG
+  cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+#endif
+
+  cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+  cutilSafeCall(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte));
+  printf("\n 16/48 L1/shared \n");
+}
+
+void op_mpi_init(int argc, char **argv, int diags, int global, int local) {
+  op_init_core(argc, argv, diags);
+
+
+  cutilDeviceInit(argc, argv);
+
+//
+// The following call is only made in the C version of OP2,
+// as it causes memory trashing when called from Fortran.
+// \warning add -DSET_CUDA_CACHE_CONFIG to compiling line
+// for this file when implementing C OP2.
+//
+
+#ifdef SET_CUDA_CACHE_CONFIG
+  cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+#endif
+
+  cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+  cutilSafeCall(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte));
+  printf("\n 16/48 L1/shared \n");
+}
+
+void op_mpi_init_soa(int argc, char **argv, int diags, int global, int local,
+                     int soa) {
+  OP_auto_soa = soa;
+  op_mpi_init(argc, argv, diags, global, local);
+}
+
+op_dat op_decl_dat_char(op_set set, int dim, char const *type, int size,
+                        char *data, char const *name) {
+  op_dat dat = op_decl_dat_core(set, dim, type, size, data, name);
+
+  // transpose data
+  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  if (strstr(type, ":soa") != NULL || (OP_auto_soa && dim > 1)) {
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          temp_data[element_size * i * set_size + element_size * j + c] =
+              data[dat->size * j + element_size * i + c];
+        }
+      }
+    }
+    op_cpHostToDevice((void **)&(dat->data_d), (void **)&(temp_data),
+                      dat->size * set_size);
+    free(temp_data);
+  } else {
+    op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data),
+                      dat->size * set_size);
+  }
+
+  return dat;
+}
+
+op_dat op_decl_dat_temp_char(op_set set, int dim, char const *type, int size,
+                             char const *name) {
+  char *data = NULL;
+  op_dat dat = op_decl_dat_temp_core(set, dim, type, size, data, name);
+
+  for (size_t i = 0; i < set->size * dim * size; i++)
+    dat->data[i] = 0;
+  dat->user_managed = 0;
+
+  op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data),
+                    dat->size * set->size);
+
+  return dat;
+}
+
+int op_free_dat_temp_char(op_dat dat) {
+  // free data on device
+  cutilSafeCall(hipFree(dat->data_d));
+
+  return op_free_dat_temp_core(dat);
+}
+
+op_set op_decl_set(int size, char const *name) {
+  return op_decl_set_core(size, name);
+}
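The SoA branch of op_decl_dat_char above re-lays the user's array-of-structs data component-by-component across the padded set (owned plus halo elements) before upload. A worked sketch of the index mapping, assuming dim = 2 and set_size = 3:

    // byte c of component i of element j moves from AoS offset
    //   dat->size * j + element_size * i + c
    // to SoA offset
    //   element_size * (i * set_size + j) + c
    // so x0 y0 x1 y1 x2 y2  becomes  x0 x1 x2 y0 y1 y2
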
+
+op_map op_decl_map(op_set from, op_set to, int dim, int *imap,
+                   char const *name) {
+  op_map map = op_decl_map_core(from, to, dim, imap, name);
+  int set_size = map->from->size + map->from->exec_size;
+  int *temp_map = (int *)malloc(map->dim * set_size * sizeof(int));
+  for (int i = 0; i < map->dim; i++) {
+    for (int j = 0; j < set_size; j++) {
+      temp_map[i * set_size + j] = map->map[map->dim * j + i];
+    }
+  }
+  op_cpHostToDevice((void **)&(map->map_d), (void **)&(temp_map),
+                    map->dim * set_size * sizeof(int));
+  free(temp_map);
+  return map;
+}
+
+op_arg op_arg_dat(op_dat dat, int idx, op_map map, int dim, char const *type,
+                  op_access acc) {
+  return op_arg_dat_core(dat, idx, map, dim, type, acc);
+}
+
+op_arg op_opt_arg_dat(int opt, op_dat dat, int idx, op_map map, int dim,
+                      char const *type, op_access acc) {
+  return op_opt_arg_dat_core(opt, dat, idx, map, dim, type, acc);
+}
+
+op_arg op_arg_gbl_char(char *data, int dim, const char *type, int size,
+                       op_access acc) {
+  return op_arg_gbl_core(1, data, dim, type, size, acc);
+}
+
+op_arg op_opt_arg_gbl_char(int opt, char *data, int dim, const char *type,
+                           int size, op_access acc) {
+  return op_arg_gbl_core(opt, data, dim, type, size, acc);
+}
+
+//
+// This function is defined in the generated master kernel file
+// so that it is possible to check on the runtime size of the
+// data in cases where it is not known at compile time
+//
+
+/*
+void
+op_decl_const_char ( int dim, char const * type, int size, char * dat,
+                     char const * name )
+{
+  cutilSafeCall ( hipMemcpyToSymbol(HIP_SYMBOL(name), dat, dim * size, 0,
+                  hipMemcpyHostToDevice ) );
+}
+*/
+
+int op_get_size(op_set set) { return set->size; }
+
+void op_printf(const char *format, ...) {
+  va_list argptr;
+  va_start(argptr, format);
+  vprintf(format, argptr);
+  va_end(argptr);
+}
+
+void op_print(const char *line) { printf("%s\n", line); }
+
+void op_timers(double *cpu, double *et) { op_timers_core(cpu, et); }
+
+int getSetSizeFromOpArg(op_arg *arg) {
+  return arg->opt ? arg->dat->set->size : 0;
+}
+
+void op_renumber(op_map base) { (void)base; }
+
+void op_renumber_ptr(int *ptr){};
+
+int getHybridGPU() { return OP_hybrid_gpu; }
+
+void op_exit() {
+  op_cuda_exit(); // frees dat_d memory
+  op_rt_exit();   // frees plan memory
+  op_exit_core(); // frees lib core variables
+}
+
+void op_timing_output() {
+  op_timing_output_core();
+  printf("Total plan time: %8.4f\n", OP_plan_time);
+}
+
+void op_timings_to_csv(const char *outputFileName) {
+  FILE *outputFile = fopen(outputFileName, "w");
+  if (outputFile == NULL) {
+    printf("ERROR: Failed to open file for writing: '%s'\n", outputFileName);
+  }
+  else {
+    fprintf(outputFile, "rank,thread,nranks,nthreads,count,total time,plan time,mpi time,GB used,GB total,kernel name\n");
+  }
+
+  if (outputFile != NULL) {
+    for (int n = 0; n < OP_kern_max; n++) {
+      if (OP_kernels[n].count > 0) {
+        if (OP_kernels[n].ntimes == 1 && OP_kernels[n].times[0] == 0.0f &&
+            OP_kernels[n].time != 0.0f) {
+          // This library is being used by an OP2 translation made with the
+          // older
+          // translator with older timing logic.
Adjust to new logic: + OP_kernels[n].times[0] = OP_kernels[n].time; + } + + for (int thr=0; thrdat; + size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; + if (dat->data_d) { + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); + int element_size = dat->size / dat->dim; + for (int i = 0; i < dat->dim; i++) { + for (int j = 0; j < set_size; j++) { + for (int c = 0; c < element_size; c++) { + temp_data[element_size * i * set_size + element_size * j + c] = + dat->data[dat->size * j + element_size * i + c]; + } + } + } + cutilSafeCall(hipMemcpy(dat->data_d, temp_data, dat->size * set_size, + hipMemcpyHostToDevice)); + dat->dirty_hd = 0; + free(temp_data); + } else { + cutilSafeCall(hipMemcpy(dat->data_d, dat->data, dat->size * set_size, + hipMemcpyHostToDevice)); + dat->dirty_hd = 0; + } + } + } +} + +void op_fetch_data_char(op_dat dat, char *usr_ptr) { + op_cuda_get_data(dat); + // need to copy data into memory pointed to by usr_ptr + memcpy((void *)usr_ptr, (void *)dat->data, dat->set->size * dat->size); +} + +void op_fetch_data_idx_char(op_dat dat, char *usr_ptr, int low, int high) { + op_cuda_get_data(dat); + if (low < 0 || high > dat->set->size - 1) { + printf("op_fetch_data: Indices not within range of elements held in %s\n", + dat->name); + exit(2); + } + // need to copy data into memory pointed to by usr_ptr + memcpy((void *)usr_ptr, (void *)&dat->data[low * dat->size], + (high + 1) * dat->size); +} + +// Dummy for cuda compile + +typedef struct { +} op_export_core; + +typedef op_export_core *op_export_handle; + +typedef struct { +} op_import_core; + +typedef op_import_core *op_import_handle; + +op_import_handle op_import_init_size(int nprocs, int *proclist, op_dat mark) { + + exit(1); +} + +op_import_handle op_import_init(op_export_handle exp_handle, op_dat coords, + op_dat mark) { + + exit(1); +} + +op_export_handle op_export_init(int nprocs, int *proclist, op_map cellsToNodes, + op_set sp_nodes, op_dat coords, op_dat mark) { + + exit(1); +} + +void op_theta_init(op_export_handle handle, int *bc_id, double *dtheta_exp, + double *dtheta_imp, double *alpha) { + + exit(1); +} + +void op_inc_theta(op_export_handle handle, int *bc_id, double *dtheta_exp, + double *dtheta_imp) { + + exit(1); +} + +void op_export_data(op_export_handle handle, op_dat dat) { exit(1); } + +void op_import_data(op_import_handle handle, op_dat dat) { exit(1); } + +#ifdef __cplusplus +} +#endif diff --git a/op2/src/hip/op_hip_rt_support.cpp b/op2/src/hip/op_hip_rt_support.cpp new file mode 100644 index 000000000..c0ae1e637 --- /dev/null +++ b/op2/src/hip/op_hip_rt_support.cpp @@ -0,0 +1,555 @@ +/* + * Open source copyright declaration based on BSD open source template: + * http://www.opensource.org/licenses/bsd-license.php + * + * This file is part of the OP2 distribution. + * + * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in + * the main source directory for a full list of copyright holders. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * The name of Mike Giles may not be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// This file implements the HIP-specific run-time support functions
+//
+
+//
+// header files
+//
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+#include <op_lib_c.h>
+#include <op_lib_core.h>
+#include <op_rt_support.h>
+#include <op_hip_rt_support.h>
+
+// Small re-declaration to avoid using struct in the C version.
+// This is due to the different way in which C and C++ see structs
+
+typedef struct hipDeviceProp_t cudaDeviceProp_t;
+
+
+// arrays for global constants and reductions
+
+int OP_consts_bytes = 0, OP_reduct_bytes = 0;
+
+char *OP_consts_h, *OP_consts_d, *OP_reduct_h, *OP_reduct_d;
+
+//
+// HIP utility functions
+//
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void __cudaSafeCall(hipError_t err, const char *file, const int line) {
+  if (hipSuccess != err) {
+    fprintf(stderr, "%s(%i) : cutilSafeCall() Runtime API error : %s.\n", file,
+            line, hipGetErrorString(err));
+    exit(-1);
+  }
+}
+
+void __cutilCheckMsg(const char *errorMessage, const char *file,
+                     const int line) {
+  hipError_t err = hipGetLastError();
+  if (hipSuccess != err) {
+    fprintf(stderr, "%s(%i) : cutilCheckMsg() error : %s : %s.\n", file, line,
+            errorMessage, hipGetErrorString(err));
+    exit(-1);
+  }
+}
+
+//
+// routines to move arrays to/from GPU device
+//
+
+void op_mvHostToDevice(void **map, int size) {
+  if (!OP_hybrid_gpu || size == 0)
+    return;
+  void *tmp;
+  cutilSafeCall(hipMalloc(&tmp, size));
+  cutilSafeCall(hipMemcpy(tmp, *map, size, hipMemcpyHostToDevice));
+  cutilSafeCall(hipDeviceSynchronize());
+  free(*map);
+  *map = tmp;
+}
+
+void op_cpHostToDevice(void **data_d, void **data_h, int size) {
+  if (!OP_hybrid_gpu)
+    return;
+  if (*data_d != NULL) cutilSafeCall(hipFree(*data_d));
+  cutilSafeCall(hipMalloc(data_d, size));
+  cutilSafeCall(hipMemcpy(*data_d, *data_h, size, hipMemcpyHostToDevice));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+op_plan *op_plan_get(char const *name, op_set set, int part_size, int nargs,
+                     op_arg *args, int ninds, int *inds) {
+  return op_plan_get_stage(name, set, part_size, nargs, args, ninds, inds,
+                           OP_STAGE_ALL);
+}
+
+op_plan *op_plan_get_stage(char const *name, op_set set, int part_size,
+                           int nargs, op_arg *args, int ninds, int *inds,
+                           int staging) {
+  return op_plan_get_stage_upload(name, set, part_size, nargs, args, ninds, inds,
+                                  staging, 1);
+}
+
+op_plan *op_plan_get_stage_upload(char const *name, op_set set, int part_size,
+                                  int nargs, op_arg *args, int ninds, int *inds,
+                                  int staging, int
upload) { + op_plan *plan = + op_plan_core(name, set, part_size, nargs, args, ninds, inds, staging); + if (!OP_hybrid_gpu || !upload) + return plan; + + int set_size = set->size; + for (int i = 0; i < nargs; i++) { + if (args[i].idx != -1 && args[i].acc != OP_READ) { + set_size += set->exec_size; + break; + } + } + + if (plan->count == 1) { + int *offsets = (int *)malloc((plan->ninds_staged + 1) * sizeof(int)); + offsets[0] = 0; + for (int m = 0; m < plan->ninds_staged; m++) { + int count = 0; + for (int m2 = 0; m2 < nargs; m2++) + if (plan->inds_staged[m2] == m) + count++; + offsets[m + 1] = offsets[m] + count; + } + op_mvHostToDevice((void **)&(plan->ind_map), + offsets[plan->ninds_staged] * set_size * sizeof(int)); + for (int m = 0; m < plan->ninds_staged; m++) { + plan->ind_maps[m] = &plan->ind_map[set_size * offsets[m]]; + } + free(offsets); + + int counter = 0; + for (int m = 0; m < nargs; m++) + if (plan->loc_maps[m] != NULL) + counter++; + op_mvHostToDevice((void **)&(plan->loc_map), + sizeof(short) * counter * set_size); + counter = 0; + for (int m = 0; m < nargs; m++) + if (plan->loc_maps[m] != NULL) { + plan->loc_maps[m] = &plan->loc_map[set_size * counter]; + counter++; + } + + op_mvHostToDevice((void **)&(plan->ind_sizes), + sizeof(int) * plan->nblocks * plan->ninds_staged); + op_mvHostToDevice((void **)&(plan->ind_offs), + sizeof(int) * plan->nblocks * plan->ninds_staged); + op_mvHostToDevice((void **)&(plan->nthrcol), sizeof(int) * plan->nblocks); + op_mvHostToDevice((void **)&(plan->thrcol), sizeof(int) * set_size); + op_mvHostToDevice((void **)&(plan->col_reord), sizeof(int) * set_size); + op_mvHostToDevice((void **)&(plan->offset), sizeof(int) * plan->nblocks); + plan->offset_d = plan->offset; + op_mvHostToDevice((void **)&(plan->nelems), sizeof(int) * plan->nblocks); + plan->nelems_d = plan->nelems; + op_mvHostToDevice((void **)&(plan->blkmap), sizeof(int) * plan->nblocks); + plan->blkmap_d = plan->blkmap; + } + + return plan; +} + +void op_cuda_exit() { + if (!OP_hybrid_gpu) + return; + op_dat_entry *item; + TAILQ_FOREACH(item, &OP_dat_list, entries) { + cutilSafeCall(hipFree((item->dat)->data_d)); + } + + for (int ip = 0; ip < OP_plan_index; ip++) { + OP_plans[ip].ind_map = NULL; + OP_plans[ip].loc_map = NULL; + OP_plans[ip].ind_sizes = NULL; + OP_plans[ip].ind_offs = NULL; + OP_plans[ip].nthrcol = NULL; + OP_plans[ip].thrcol = NULL; + OP_plans[ip].col_reord = NULL; + OP_plans[ip].offset = NULL; + OP_plans[ip].nelems = NULL; + OP_plans[ip].blkmap = NULL; + } + // hipDeviceReset ( ); +} + +// +// routines to resize constant/reduct arrays, if necessary +// + +void reallocConstArrays(int consts_bytes) { + if (consts_bytes > OP_consts_bytes) { + if (OP_consts_bytes > 0) { + free(OP_consts_h); + cutilSafeCall(hipFree(OP_consts_d)); + } + OP_consts_bytes = 4 * consts_bytes; // 4 is arbitrary, more than needed + OP_consts_h = (char *)malloc(OP_consts_bytes); + cutilSafeCall(hipMalloc((void **)&OP_consts_d, OP_consts_bytes)); + } +} + +void reallocReductArrays(int reduct_bytes) { + if (reduct_bytes > OP_reduct_bytes) { + if (OP_reduct_bytes > 0) { + free(OP_reduct_h); + cutilSafeCall(hipFree(OP_reduct_d)); + } + OP_reduct_bytes = 4 * reduct_bytes; // 4 is arbitrary, more than needed + OP_reduct_h = (char *)malloc(OP_reduct_bytes); + cutilSafeCall(hipMalloc((void **)&OP_reduct_d, OP_reduct_bytes)); + } +} + +// +// routines to move constant/reduct arrays +// + +void mvConstArraysToDevice(int consts_bytes) { + cutilSafeCall(hipMemcpy(OP_consts_d, OP_consts_h, consts_bytes, + 
hipMemcpyHostToDevice));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+void mvConstArraysToHost(int consts_bytes) {
+  cutilSafeCall(hipMemcpy(OP_consts_h, OP_consts_d, consts_bytes,
+                          hipMemcpyDeviceToHost));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+void mvReductArraysToDevice(int reduct_bytes) {
+  cutilSafeCall(hipMemcpy(OP_reduct_d, OP_reduct_h, reduct_bytes,
+                          hipMemcpyHostToDevice));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+void mvReductArraysToHost(int reduct_bytes) {
+  cutilSafeCall(hipMemcpy(OP_reduct_h, OP_reduct_d, reduct_bytes,
+                          hipMemcpyDeviceToHost));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+//
+// routine to fetch data from GPU to CPU (with transposing SoA to AoS if needed)
+//
+
+void op_cuda_get_data(op_dat dat) {
+  if (!OP_hybrid_gpu)
+    return;
+  if (dat->dirty_hd == 2)
+    dat->dirty_hd = 0;
+  else
+    return;
+  // transpose data
+  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
+    cutilSafeCall(hipMemcpy(temp_data, dat->data_d, dat->size * set_size,
+                            hipMemcpyDeviceToHost));
+    cutilSafeCall(hipDeviceSynchronize());
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          dat->data[dat->size * j + element_size * i + c] =
+              temp_data[element_size * i * set_size + element_size * j + c];
+        }
+      }
+    }
+    free(temp_data);
+  } else {
+    cutilSafeCall(hipMemcpy(dat->data, dat->data_d, dat->size * set_size,
+                            hipMemcpyDeviceToHost));
+    cutilSafeCall(hipDeviceSynchronize());
+  }
+}
+
+void deviceSync() {
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+#ifndef OPMPI
+
+void cutilDeviceInit(int argc, char **argv) {
+  (void)argc;
+  (void)argv;
+  int deviceCount;
+  cutilSafeCall(hipGetDeviceCount(&deviceCount));
+  if (deviceCount == 0) {
+    printf("cutil error: no devices supporting HIP\n");
+    exit(-1);
+  }
+
+  // Test we have access to a device
+  float *test;
+  hipError_t err = hipMalloc((void **)&test, sizeof(float));
+  if (err != hipSuccess) {
+    OP_hybrid_gpu = 0;
+  } else {
+    OP_hybrid_gpu = 1;
+  }
+  if (OP_hybrid_gpu) {
+    hipFree(test);
+
+    cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferL1));
+
+    int deviceId = -1;
+    hipGetDevice(&deviceId);
+    cudaDeviceProp_t deviceProp;
+    cutilSafeCall(hipGetDeviceProperties(&deviceProp, deviceId));
+    printf("\n Using HIP device: %d %s\n", deviceId, deviceProp.name);
+  } else {
+    printf("\n Using CPU\n");
+  }
+}
+
+void op_upload_dat(op_dat dat) {
+  if (!OP_hybrid_gpu)
+    return;
+  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          temp_data[element_size * i * set_size + element_size * j + c] =
+              dat->data[dat->size * j + element_size * i + c];
+        }
+      }
+    }
+    cutilSafeCall(hipMemcpy(dat->data_d, temp_data, set_size * dat->size,
+                            hipMemcpyHostToDevice));
+    free(temp_data);
+  } else {
+    cutilSafeCall(hipMemcpy(dat->data_d, dat->data, set_size * dat->size,
+                            hipMemcpyHostToDevice));
+  }
+}
+
+void op_download_dat(op_dat dat) {
+  if (!OP_hybrid_gpu)
+    return;
+  size_t set_size = dat->set->size +
dat->set->exec_size + dat->set->nonexec_size; + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); + cutilSafeCall(hipMemcpy(temp_data, dat->data_d, set_size * dat->size, + hipMemcpyDeviceToHost)); + int element_size = dat->size / dat->dim; + for (int i = 0; i < dat->dim; i++) { + for (int j = 0; j < set_size; j++) { + for (int c = 0; c < element_size; c++) { + dat->data[dat->size * j + element_size * i + c] = + temp_data[element_size * i * set_size + element_size * j + c]; + } + } + } + free(temp_data); + } else { + cutilSafeCall(hipMemcpy(dat->data, dat->data_d, set_size * dat->size, + hipMemcpyDeviceToHost)); + } +} + +int op_mpi_halo_exchanges(op_set set, int nargs, op_arg *args) { + for (int n = 0; n < nargs; n++) + if (args[n].opt && args[n].argtype == OP_ARG_DAT && + args[n].dat->dirty_hd == 2) { + op_download_dat(args[n].dat); + args[n].dat->dirty_hd = 0; + } + return set->size; +} + +int op_mpi_halo_exchanges_grouped(op_set set, int nargs, op_arg *args, int device){ + (void)device; + return device == 1 ? op_mpi_halo_exchanges(set, nargs, args) : op_mpi_halo_exchanges_cuda(set, nargs, args); +} + +void op_mpi_set_dirtybit(int nargs, op_arg *args) { + for (int n = 0; n < nargs; n++) { + if ((args[n].opt == 1) && (args[n].argtype == OP_ARG_DAT) && + (args[n].acc == OP_INC || args[n].acc == OP_WRITE || + args[n].acc == OP_RW)) { + args[n].dat->dirty_hd = 1; + } + } +} + +void op_mpi_wait_all(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +void op_mpi_wait_all_grouped(int nargs, op_arg *args, int device) { + (void)device; + (void)nargs; + (void)args; +} + +void op_mpi_test_all(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +void op_mpi_test_all_grouped(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +int op_mpi_halo_exchanges_cuda(op_set set, int nargs, op_arg *args) { + for (int n = 0; n < nargs; n++) + if (args[n].opt && args[n].argtype == OP_ARG_DAT && + args[n].dat->dirty_hd == 1) { + op_upload_dat(args[n].dat); + args[n].dat->dirty_hd = 0; + } + return set->size; +} + +void op_mpi_set_dirtybit_cuda(int nargs, op_arg *args) { + for (int n = 0; n < nargs; n++) { + if ((args[n].opt == 1) && (args[n].argtype == OP_ARG_DAT) && + (args[n].acc == OP_INC || args[n].acc == OP_WRITE || + args[n].acc == OP_RW)) { + args[n].dat->dirty_hd = 2; + } + } +} + +void op_mpi_wait_all_cuda(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +void op_mpi_reset_halos(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +void op_mpi_barrier() {} + +void *op_mpi_perf_time(const char *name, double time) { + (void)name; + (void)time; + return (void *)name; +} + +#ifdef COMM_PERF +void op_mpi_perf_comms(void *k_i, int nargs, op_arg *args) { + (void)k_i; + (void)nargs; + (void)args; +} +#endif + +void op_mpi_reduce_float(op_arg *args, float *data) { + (void)args; + (void)data; +} + +void op_mpi_reduce_double(op_arg *args, double *data) { + (void)args; + (void)data; +} + +void op_mpi_reduce_int(op_arg *args, int *data) { + (void)args; + (void)data; +} + +void op_mpi_reduce_bool(op_arg *args, bool *data) { + (void)args; + (void)data; +} + +void op_partition(const char *lib_name, const char *lib_routine, + op_set prime_set, op_map prime_map, op_dat coords) { + (void)lib_name; + (void)lib_routine; + (void)prime_set; + (void)prime_map; + (void)coords; +} + +void op_partition_ptr(const char *lib_name, const char *lib_routine, + op_set prime_set, int *prime_map, 
double *coords) { + (void)lib_name; + (void)lib_routine; + (void)prime_set; + (void)prime_map; + (void)coords; +} + +void op_partition_reverse() {} + +void op_compute_moment(double t, double *first, double *second) { + *first = t; + *second = t * t; +} +void op_compute_moment_across_times(double* times, int ntimes, bool ignore_zeros, double *first, double *second) { + *first = 0.0; + *second = 0.0f; + int n = 0; + for (int i=0; i +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +// +// MPI Communicator for halo creation and exchange +// + +MPI_Comm OP_MPI_WORLD; +MPI_Comm OP_MPI_GLOBAL; + +// +// HIP-specific OP2 functions +// + +void op_init(int argc, char **argv, int diags) { + op_init_soa(argc, argv, diags, 0); +} + +void op_init_soa(int argc, char **argv, int diags, int soa) { + int flag = 0; + OP_auto_soa = soa; + MPI_Initialized(&flag); + if (!flag) { + MPI_Init(&argc, &argv); + } + OP_MPI_WORLD = MPI_COMM_WORLD; + OP_MPI_GLOBAL = MPI_COMM_WORLD; + op_init_core(argc, argv, diags); + + cutilDeviceInit(argc, argv); + +// +// The following call is only made in the C version of OP2, +// as it causes memory trashing when called from Fortran. +// \warning add -DSET_CUDA_CACHE_CONFIG to compiling line +// for this file when implementing C OP2. +// + if (OP_hybrid_gpu) { +#ifdef SET_CUDA_CACHE_CONFIG + cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); +#endif + cutilSafeCall(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte)); + printf("\n 16/48 L1/shared \n"); + } +} + +void op_mpi_init(int argc, char **argv, int diags, MPI_Fint global, + MPI_Fint local) { + op_mpi_init_soa(argc, argv, diags, global, local, 0); +} + +void op_mpi_init_soa(int argc, char **argv, int diags, MPI_Fint global, + MPI_Fint local, int soa) { + OP_auto_soa = soa; + int flag = 0; + MPI_Initialized(&flag); + if (!flag) { + printf("Error: MPI has to be initialized when calling op_mpi_init with " + "communicators\n"); + exit(-1); + } + OP_MPI_WORLD = MPI_Comm_f2c(local); + OP_MPI_GLOBAL = MPI_Comm_f2c(global); + op_init_core(argc, argv, diags); + + cutilDeviceInit(argc, argv); + +// +// The following call is only made in the C version of OP2, +// as it causes memory trashing when called from Fortran. +// \warning add -DSET_CUDA_CACHE_CONFIG to compiling line +// for this file when implementing C OP2. 
+// + +#ifdef SET_CUDA_CACHE_CONFIG + cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); +#endif + + //cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); + cutilSafeCall(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte)); + printf("\n 16/48 L1/shared \n"); +} + +op_dat op_decl_dat_char(op_set set, int dim, char const *type, int size, + char *data, char const *name) { + if (set == NULL || data == NULL) + return NULL; + /*char *d = (char *)malloc((size_t)set->size * (size_t)dim * (size_t)size); + if (d == NULL && set->size>0) { + printf(" op_decl_dat_char error -- error allocating memory to dat\n"); + exit(-1); + } + + memcpy(d, data, set->size * dim * size * sizeof(char)); + op_dat out_dat = op_decl_dat_core(set, dim, type, size, d, name);*/ + op_dat out_dat = op_decl_dat_core(set, dim, type, size, data, name); + + op_dat_entry *item; + op_dat_entry *tmp_item; + for (item = TAILQ_FIRST(&OP_dat_list); item != NULL; item = tmp_item) { + tmp_item = TAILQ_NEXT(item, entries); + if (item->dat == out_dat) { + item->orig_ptr = data; + break; + } + } + out_dat->user_managed = 0; + return out_dat; +} + +op_dat op_decl_dat_temp_char(op_set set, int dim, char const *type, int size, + char const *name) { + char *data = NULL; + op_dat dat = op_decl_dat_temp_core(set, dim, type, size, data, name); + + // create empty data block to assign to this temporary dat (including the + // halos) + int set_size = set->size + OP_import_exec_list[set->index]->size + + OP_import_nonexec_list[set->index]->size; + + // initialize data bits to 0 + for (size_t i = 0; i < set_size * dim * size; i++) + dat->data[i] = 0; + dat->user_managed = 0; + + // transpose + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + cutilSafeCall( + hipMalloc((void **)&(dat->buffer_d_r), + (size_t)dat->size * (OP_import_exec_list[set->index]->size + + OP_import_nonexec_list[set->index]->size))); + } + + op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), + (size_t)dat->size * set_size); + + // need to allocate mpi_buffers for this new temp_dat + op_mpi_buffer mpi_buf = (op_mpi_buffer)xmalloc(sizeof(op_mpi_buffer_core)); + + halo_list exec_e_list = OP_export_exec_list[set->index]; + halo_list nonexec_e_list = OP_export_nonexec_list[set->index]; + + mpi_buf->buf_exec = (char *)xmalloc((exec_e_list->size) * (size_t)dat->size); + mpi_buf->buf_nonexec = (char *)xmalloc((nonexec_e_list->size) * (size_t)dat->size); + + halo_list exec_i_list = OP_import_exec_list[set->index]; + halo_list nonexec_i_list = OP_import_nonexec_list[set->index]; + + mpi_buf->s_req = (MPI_Request *)xmalloc( + sizeof(MPI_Request) * + (exec_e_list->ranks_size + nonexec_e_list->ranks_size)); + mpi_buf->r_req = (MPI_Request *)xmalloc( + sizeof(MPI_Request) * + (exec_i_list->ranks_size + nonexec_i_list->ranks_size)); + + mpi_buf->s_num_req = 0; + mpi_buf->r_num_req = 0; + + dat->mpi_buffer = mpi_buf; + + // need to allocate device buffers for mpi comms for this new temp_dat + cutilSafeCall( + hipMalloc((void **)&(dat->buffer_d), + (size_t)dat->size * (OP_export_exec_list[set->index]->size + + OP_export_nonexec_list[set->index]->size))); + + return dat; +} + +int op_free_dat_temp_char(op_dat dat) { + // need to free mpi_buffers use in this op_dat + free(((op_mpi_buffer)(dat->mpi_buffer))->buf_exec); + free(((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec); + free(((op_mpi_buffer)(dat->mpi_buffer))->s_req); + free(((op_mpi_buffer)(dat->mpi_buffer))->r_req); + free(dat->mpi_buffer); + + // need to free 
device buffers used in mpi comms + cutilSafeCall(hipFree(dat->buffer_d)); + + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + cutilSafeCall(hipFree(dat->buffer_d_r)); + } + + // free data on device + cutilSafeCall(hipFree(dat->data_d)); + return op_free_dat_temp_core(dat); +} + +void op_mv_halo_device(op_set set, op_dat dat) { + int set_size = set->size + OP_import_exec_list[set->index]->size + + OP_import_nonexec_list[set->index]->size; + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + char *temp_data = (char *)malloc((size_t)dat->size * set_size * sizeof(char)); + size_t element_size = (size_t)dat->size / dat->dim; + for (size_t i = 0; i < dat->dim; i++) { + for (size_t j = 0; j < set_size; j++) { + for (size_t c = 0; c < element_size; c++) { + temp_data[element_size * i * set_size + element_size * j + c] = + dat->data[(size_t)dat->size * j + element_size * i + c]; + } + } + } + op_cpHostToDevice((void **)&(dat->data_d), (void **)&(temp_data), + (size_t)dat->size * set_size); + free(temp_data); + + if (dat->buffer_d_r != NULL) cutilSafeCall(hipFree(dat->buffer_d_r)); + cutilSafeCall( + hipMalloc((void **)&(dat->buffer_d_r), + (size_t)dat->size * (OP_import_exec_list[set->index]->size + + OP_import_nonexec_list[set->index]->size))); + + } else { + op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), + (size_t)dat->size * set_size); + } + dat->dirty_hd = 0; + if (dat->buffer_d != NULL) cutilSafeCall(hipFree(dat->buffer_d)); + cutilSafeCall( + hipMalloc((void **)&(dat->buffer_d), + (size_t)dat->size * (OP_export_exec_list[set->index]->size + + OP_export_nonexec_list[set->index]->size + + set_import_buffer_size[set->index]))); +} + +void op_mv_halo_list_device() { + if (export_exec_list_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (export_exec_list_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(export_exec_list_d[OP_set_list[s]->index])); + free(export_exec_list_d); + } + export_exec_list_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + export_exec_list_d[set->index] = NULL; + + op_cpHostToDevice((void **)&(export_exec_list_d[set->index]), + (void **)&(OP_export_exec_list[set->index]->list), + OP_export_exec_list[set->index]->size * sizeof(int)); + } + + if (export_nonexec_list_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (export_nonexec_list_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(export_nonexec_list_d[OP_set_list[s]->index])); + free(export_nonexec_list_d); + } + export_nonexec_list_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + export_nonexec_list_d[set->index] = NULL; + + op_cpHostToDevice((void **)&(export_nonexec_list_d[set->index]), + (void **)&(OP_export_nonexec_list[set->index]->list), + OP_export_nonexec_list[set->index]->size * sizeof(int)); + } + + //for grouped, we need the disps array on device too + if (export_exec_list_disps_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (export_exec_list_disps_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(export_exec_list_disps_d[OP_set_list[s]->index])); + free(export_exec_list_disps_d); + } + export_exec_list_disps_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + export_exec_list_disps_d[set->index] = NULL; 
+ + //make sure end size is there too + OP_export_exec_list[set->index] + ->disps[OP_export_exec_list[set->index]->ranks_size] = + OP_export_exec_list[set->index]->ranks_size == 0 + ? 0 + : OP_export_exec_list[set->index] + ->disps[OP_export_exec_list[set->index]->ranks_size - 1] + + OP_export_exec_list[set->index] + ->sizes[OP_export_exec_list[set->index]->ranks_size - 1]; + op_cpHostToDevice((void **)&(export_exec_list_disps_d[set->index]), + (void **)&(OP_export_exec_list[set->index]->disps), + (OP_export_exec_list[set->index]->ranks_size+1) * sizeof(int)); + } + + if (export_nonexec_list_disps_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (export_nonexec_list_disps_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(export_nonexec_list_disps_d[OP_set_list[s]->index])); + free(export_nonexec_list_disps_d); + } + export_nonexec_list_disps_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + export_nonexec_list_disps_d[set->index] = NULL; + + //make sure end size is there too + OP_export_nonexec_list[set->index] + ->disps[OP_export_nonexec_list[set->index]->ranks_size] = + OP_export_nonexec_list[set->index]->ranks_size == 0 + ? 0 + : OP_export_nonexec_list[set->index] + ->disps[OP_export_nonexec_list[set->index]->ranks_size - + 1] + + OP_export_nonexec_list[set->index] + ->sizes[OP_export_nonexec_list[set->index]->ranks_size - + 1]; + op_cpHostToDevice((void **)&(export_nonexec_list_disps_d[set->index]), + (void **)&(OP_export_nonexec_list[set->index]->disps), + (OP_export_nonexec_list[set->index]->ranks_size+1) * sizeof(int)); + } + if (import_exec_list_disps_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (import_exec_list_disps_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(import_exec_list_disps_d[OP_set_list[s]->index])); + free(import_exec_list_disps_d); + } + import_exec_list_disps_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + import_exec_list_disps_d[set->index] = NULL; + + //make sure end size is there too + OP_import_exec_list[set->index] + ->disps[OP_import_exec_list[set->index]->ranks_size] = + OP_import_exec_list[set->index]->ranks_size == 0 + ? 0 + : OP_import_exec_list[set->index] + ->disps[OP_import_exec_list[set->index]->ranks_size - 1] + + OP_import_exec_list[set->index] + ->sizes[OP_import_exec_list[set->index]->ranks_size - 1]; + op_cpHostToDevice((void **)&(import_exec_list_disps_d[set->index]), + (void **)&(OP_import_exec_list[set->index]->disps), + (OP_import_exec_list[set->index]->ranks_size+1) * sizeof(int)); + } + + if (import_nonexec_list_disps_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (import_nonexec_list_disps_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(import_nonexec_list_disps_d[OP_set_list[s]->index])); + free(import_nonexec_list_disps_d); + } + import_nonexec_list_disps_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + import_nonexec_list_disps_d[set->index] = NULL; + + //make sure end size is there too + OP_import_nonexec_list[set->index] + ->disps[OP_import_nonexec_list[set->index]->ranks_size] = + OP_import_nonexec_list[set->index]->ranks_size == 0 + ? 
0
+              : OP_import_nonexec_list[set->index]
+                    ->disps[OP_import_nonexec_list[set->index]->ranks_size -
+                            1] +
+                OP_import_nonexec_list[set->index]
+                    ->sizes[OP_import_nonexec_list[set->index]->ranks_size -
+                            1];
+    op_cpHostToDevice((void **)&(import_nonexec_list_disps_d[set->index]),
+                      (void **)&(OP_import_nonexec_list[set->index]->disps),
+                      (OP_import_nonexec_list[set->index]->ranks_size+1) * sizeof(int));
+  }
+
+  if (export_nonexec_list_partial_d != NULL) {
+    for (int s = 0; s < OP_map_index; s++)
+      if (OP_map_partial_exchange[s] && export_nonexec_list_partial_d[OP_map_list[s]->index] != NULL)
+        cutilSafeCall(hipFree(export_nonexec_list_partial_d[OP_map_list[s]->index]));
+    free(export_nonexec_list_partial_d);
+  }
+  export_nonexec_list_partial_d = (int **)calloc(OP_map_index, sizeof(int *));
+
+  for (int s = 0; s < OP_map_index; s++) { // for each map
+    if (!OP_map_partial_exchange[s])
+      continue;
+    op_map map = OP_map_list[s];
+    export_nonexec_list_partial_d[map->index] = NULL;
+
+    op_cpHostToDevice((void **)&(export_nonexec_list_partial_d[map->index]),
+                      (void **)&(OP_export_nonexec_permap[map->index]->list),
+                      OP_export_nonexec_permap[map->index]->size * sizeof(int));
+  }
+
+  if (import_nonexec_list_partial_d != NULL) {
+    for (int s = 0; s < OP_map_index; s++)
+      if (OP_map_partial_exchange[s] && import_nonexec_list_partial_d[OP_map_list[s]->index] != NULL)
+        cutilSafeCall(hipFree(import_nonexec_list_partial_d[OP_map_list[s]->index]));
+    free(import_nonexec_list_partial_d);
+  }
+  import_nonexec_list_partial_d = (int **)calloc(OP_map_index, sizeof(int *));
+
+  for (int s = 0; s < OP_map_index; s++) { // for each map
+    if (!OP_map_partial_exchange[s])
+      continue;
+    op_map map = OP_map_list[s];
+    import_nonexec_list_partial_d[map->index] = NULL;
+
+    op_cpHostToDevice((void **)&(import_nonexec_list_partial_d[map->index]),
+                      (void **)&(OP_import_nonexec_permap[map->index]->list),
+                      OP_import_nonexec_permap[map->index]->size * sizeof(int));
+  }
+}
+
+op_set op_decl_set(int size, char const *name) {
+  return op_decl_set_core(size, name);
+}
+
+op_map op_decl_map(op_set from, op_set to, int dim, int *imap,
+                   char const *name) {
+  // int *m = (int *)xmalloc(from->size * dim * sizeof(int));
+  // memcpy(m, imap, from->size * dim * sizeof(int));
+  op_map out_map = op_decl_map_core(from, to, dim, imap, name);
+  out_map->user_managed = 0;
+  return out_map;
+  // return op_decl_map_core ( from, to, dim, imap, name );
+}
+
+op_arg op_arg_dat(op_dat dat, int idx, op_map map, int dim, char const *type,
+                  op_access acc) {
+  return op_arg_dat_core(dat, idx, map, dim, type, acc);
+}
+
+op_arg op_opt_arg_dat(int opt, op_dat dat, int idx, op_map map, int dim,
+                      char const *type, op_access acc) {
+  return op_opt_arg_dat_core(opt, dat, idx, map, dim, type, acc);
+}
+
+op_arg op_arg_gbl_char(char *data, int dim, const char *type, int size,
+                       op_access acc) {
+  return op_arg_gbl_core(1, data, dim, type, size, acc);
+}
+
+op_arg op_opt_arg_gbl_char(int opt, char *data, int dim, const char *type,
+                           int size, op_access acc) {
+  return op_arg_gbl_core(opt, data, dim, type, size, acc);
+}
+
+void op_printf(const char *format, ...) {
+  int my_rank;
+  MPI_Comm_rank(OP_MPI_WORLD, &my_rank);
+  if (my_rank == MPI_ROOT) {
+    va_list argptr;
+    va_start(argptr, format);
+    vprintf(format, argptr);
+    va_end(argptr);
+  }
+}
+
+void op_print(const char *line) {
+  int my_rank;
+  MPI_Comm_rank(OP_MPI_WORLD, &my_rank);
+  if (my_rank == MPI_ROOT) {
+    printf("%s\n", line);
+  }
+}
+
+void op_timers(double *cpu, double *et) {
+  MPI_Barrier(OP_MPI_WORLD);
+  op_timers_core(cpu, et);
+}
+
+//
+// This function is defined in the generated master kernel file
+// so that it is possible to check on the runtime size of the
+// data in cases where it is not known at compile time
+//
+
+/*
+void
+op_decl_const_char ( int dim, char const * type, int size, char * dat,
+                     char const * name )
+{
+  cutilSafeCall ( hipMemcpyToSymbol(HIP_SYMBOL(name), dat, dim * size, 0,
+                                    hipMemcpyHostToDevice ) );
+}
+*/
+
+void op_exit() {
+  // need to free buffer_d used for mpi comms in each op_dat
+  if (OP_hybrid_gpu) {
+    op_dat_entry *item;
+    TAILQ_FOREACH(item, &OP_dat_list, entries) {
+      if (strstr(item->dat->type, ":soa") != NULL ||
+          (OP_auto_soa && item->dat->dim > 1)) {
+        cutilSafeCall(hipFree((item->dat)->buffer_d_r));
+      }
+      cutilSafeCall(hipFree((item->dat)->buffer_d));
+    }
+
+    for (int i = 0; i < OP_set_index; i++) {
+      if (export_exec_list_d[i] != NULL)
+        cutilSafeCall(hipFree(export_exec_list_d[i]));
+      if (export_nonexec_list_d[i] != NULL)
+        cutilSafeCall(hipFree(export_nonexec_list_d[i]));
+    }
+    for (int i = 0; i < OP_map_index; i++) {
+      if (!OP_map_partial_exchange[i])
+        continue;
+      cutilSafeCall(hipFree(export_nonexec_list_partial_d[i]));
+      cutilSafeCall(hipFree(import_nonexec_list_partial_d[i]));
+    }
+  }
+
+  op_mpi_exit();
+  op_cuda_exit(); // frees dat_d memory
+  op_rt_exit();   // frees plan memory
+  op_exit_core(); // frees lib core variables
+
+  int flag = 0;
+  MPI_Finalized(&flag);
+  if (!flag)
+    MPI_Finalize();
+}
+
+void op_timing_output() {
+  op_timing_output_core();
+  printf("Total plan time: %8.4f\n", OP_plan_time);
+}
+
+void op_timings_to_csv(const char *outputFileName) {
+  int comm_size, comm_rank;
+  MPI_Comm_size(OP_MPI_WORLD, &comm_size);
+  MPI_Comm_rank(OP_MPI_WORLD, &comm_rank);
+
+  FILE * outputFile = NULL;
+  if (op_is_root()) {
+    outputFile = fopen(outputFileName, "w");
+    if (outputFile == NULL) {
+      printf("ERROR: Failed to open file for writing: '%s'\n", outputFileName);
+    }
+    else {
+      fprintf(outputFile, "rank,thread,nranks,nthreads,count,total time,plan time,mpi time,GB used,GB total,kernel name\n");
+    }
+  }
+
+  int can_write = (outputFile != NULL);
+  MPI_Bcast(&can_write, 1, MPI_INT, MPI_ROOT, OP_MPI_WORLD);
+
+  if (can_write) {
+    for (int n = 0; n < OP_kern_max; n++) {
+      if (OP_kernels[n].count > 0) {
+        if (OP_kernels[n].ntimes == 1 && OP_kernels[n].times[0] == 0.0f &&
+            OP_kernels[n].time != 0.0f) {
+          // This library is being used by an OP2 translation made with the
+          // older translator with older timing logic. Adjust to new logic:
+          OP_kernels[n].times[0] = OP_kernels[n].time;
+        }
+
+        if (op_is_root()) {
+          double times[OP_kernels[n].ntimes*comm_size];
+          for (int i=0; i<(OP_kernels[n].ntimes*comm_size); i++) times[i] = 0.0f;
+          MPI_Gather(OP_kernels[n].times, OP_kernels[n].ntimes, MPI_DOUBLE, times, OP_kernels[n].ntimes, MPI_DOUBLE, MPI_ROOT, OP_MPI_WORLD);
+
+          float plan_times[comm_size];
+          for (int i=0; idata);
+  free(temp->set);
+  free(temp);
+}
+
+void op_print_dat_to_txtfile(op_dat dat, const char *file_name) {
+  // need to get data from GPU
+  op_cuda_get_data(dat);
+
+  // rearrange data back to original order in mpi
+  op_dat temp = op_mpi_get_data(dat);
+  print_dat_to_txtfile_mpi(temp, file_name);
+
+  free(temp->data);
+  free(temp->set);
+  free(temp);
+}
+
+void op_upload_all() {
+  op_dat_entry *item;
+  TAILQ_FOREACH(item, &OP_dat_list, entries) {
+    op_dat dat = item->dat;
+    int set_size = dat->set->size + OP_import_exec_list[dat->set->index]->size +
+                   OP_import_nonexec_list[dat->set->index]->size;
+    if (dat->data_d) {
+      if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+        char *temp_data = (char *)malloc((size_t)dat->size * set_size * sizeof(char));
+        size_t element_size = (size_t)dat->size / dat->dim;
+        for (size_t i = 0; i < dat->dim; i++) {
+          for (size_t j = 0; j < set_size; j++) {
+            for (size_t c = 0; c < element_size; c++) {
+              temp_data[element_size * i * set_size + element_size * j + c] =
+                  dat->data[(size_t)dat->size * j + element_size * i + c];
+            }
+          }
+        }
+        cutilSafeCall(hipMemcpy(dat->data_d, temp_data, (size_t)dat->size * set_size,
+                                hipMemcpyHostToDevice));
+        dat->dirty_hd = 0;
+        free(temp_data);
+      } else {
+        cutilSafeCall(hipMemcpy(dat->data_d, dat->data, (size_t)dat->size * set_size,
+                                hipMemcpyHostToDevice));
+        dat->dirty_hd = 0;
+      }
+    }
+  }
+}
+
+void op_fetch_data_char(op_dat dat, char *usr_ptr) {
+  // need to get data from GPU
+  op_cuda_get_data(dat);
+
+  // rearrange data back to original order in mpi
+  op_dat temp = op_mpi_get_data(dat);
+
+  // copy data into usr_ptr
+  memcpy((void *)usr_ptr, (void *)temp->data, temp->set->size * temp->size);
+  free(temp->data);
+  free(temp->set);
+  free(temp);
+}
+
+op_dat op_fetch_data_file_char(op_dat dat) {
+  // need to get data from GPU
+  op_cuda_get_data(dat);
+  // rearrange data back to original order in mpi
+  return op_mpi_get_data(dat);
+}
+
+void op_fetch_data_idx_char(op_dat dat, char *usr_ptr, int low, int high) {
+  // need to get data from GPU
+  op_cuda_get_data(dat);
+
+  // rearrange data back to original order in mpi
+  op_dat temp = op_mpi_get_data(dat);
+
+  // do allgather on temp->data and copy it to memory block pointed to by
+  // usr_ptr
+  fetch_data_hdf5(temp, usr_ptr, low, high);
+
+  free(temp->data);
+  free(temp->set);
+  free(temp);
+}
diff --git a/op2/src/mpi/op_mpi_hip_kernels.cpp b/op2/src/mpi/op_mpi_hip_kernels.cpp
new file mode 100644
index 000000000..e9c7cd661
--- /dev/null
+++ b/op2/src/mpi/op_mpi_hip_kernels.cpp
@@ -0,0 +1,453 @@
+#include "hip/hip_runtime.h"
+/*
+ * Open source copyright declaration based on BSD open source template:
+ * http://www.opensource.org/licenses/bsd-license.php
+ *
+ * This file is part of the OP2 distribution.
+ *
+ * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in
+ * the main source directory for a full list of copyright holders.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * The name of Mike Giles may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define OP_MPI_CORE_NOMPI +#include + +#include +#include +#include +#include + +__global__ void export_halo_gather(int *list, char *dat, int copy_size, + int elem_size, char *export_buffer) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < copy_size) { + int off = 0; + if (elem_size % 16 == 0) { + off += 16 * (elem_size / 16); + for (int i = 0; i < elem_size / 16; i++) { + ((double2 *)(export_buffer + id * elem_size))[i] = + ((double2 *)(dat + list[id] * elem_size))[i]; + } + } else if (elem_size % 8 == 0) { + off += 8 * (elem_size / 8); + for (int i = 0; i < elem_size / 8; i++) { + ((double *)(export_buffer + id * elem_size))[i] = + ((double *)(dat + list[id] * elem_size))[i]; + } + } + for (int i = off; i < elem_size; i++) { + export_buffer[id * elem_size + i] = dat[list[id] * elem_size + i]; + } + } +} + +__global__ void export_halo_gather_soa(int *list, char *dat, int copy_size, + int elem_size, char *export_buffer, + int set_size, int dim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size_of = elem_size / dim; + if (id < copy_size) { + if (size_of == 8) { + for (int i = 0; i < dim; i++) { + ((double *)(export_buffer + id * elem_size))[i] = + ((double *)(dat + list[id] * size_of))[i * set_size]; + } + } else { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < size_of; j++) { + export_buffer[id * elem_size + i * size_of + j] = + dat[list[id] * size_of + i * set_size * size_of + j]; + } + } + } + } +} + +__global__ void import_halo_scatter_soa(int offset, char *dat, int copy_size, + int elem_size, char *import_buffer, + int set_size, int dim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size_of = elem_size / dim; + if (id < copy_size) { + if (size_of == 8) { + for (int i = 0; i < dim; i++) { + ((double *)(dat + (offset + id) * size_of))[i * set_size] = + ((double *)(import_buffer + id * elem_size))[i]; + } + } else { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < size_of; j++) { + dat[(offset + id) * size_of + i * set_size * size_of + j] = + import_buffer[id * elem_size + i * size_of + j]; + } + } + } + } +} + +__global__ void 
import_halo_scatter_partial_soa(int *list, char *dat, + int copy_size, int elem_size, + char *import_buffer, + int set_size, int dim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size_of = elem_size / dim; + if (id < copy_size) { + int element = list[id]; + if (size_of == 8) { + for (int i = 0; i < dim; i++) { + ((double *)(dat + (element)*size_of))[i * set_size] = + ((double *)(import_buffer + id * elem_size))[i]; + } + } else { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < size_of; j++) { + dat[(element)*size_of + i * set_size * size_of + j] = + import_buffer[id * elem_size + i * size_of + j]; + } + } + } + } +} + +__global__ void import_halo_scatter_partial(int *list, char *dat, int copy_size, + int elem_size, char *import_buffer, + int dim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size_of = elem_size / dim; + if (id < copy_size) { + int element = list[id]; + if (size_of == 8) { + for (int i = 0; i < dim; i++) { + ((double *)(dat + element * elem_size))[i] = + ((double *)(import_buffer + id * elem_size))[i]; + } + } else { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < size_of; j++) { + dat[element * elem_size + i * size_of + j] = + import_buffer[id * elem_size + i * size_of + j]; + } + } + } + } +} + +void gather_data_to_buffer(op_arg arg, halo_list exp_exec_list, + halo_list exp_nonexec_list) { + int threads = 192; + int blocks = 1 + ((exp_exec_list->size - 1) / 192); + + if (strstr(arg.dat->type, ":soa") != NULL || + (OP_auto_soa && arg.dat->dim > 1)) { + + int set_size = arg.dat->set->size + arg.dat->set->exec_size + + arg.dat->set->nonexec_size; + + hipLaunchKernelGGL(export_halo_gather_soa, blocks, threads, 0, 0, + export_exec_list_d[arg.dat->set->index], arg.data_d, + exp_exec_list->size, arg.dat->size, arg.dat->buffer_d, set_size, + arg.dat->dim); + + int blocks2 = 1 + ((exp_nonexec_list->size - 1) / 192); + hipLaunchKernelGGL(export_halo_gather_soa, blocks2, threads, 0, 0, + export_nonexec_list_d[arg.dat->set->index], arg.data_d, + exp_nonexec_list->size, arg.dat->size, + arg.dat->buffer_d + exp_exec_list->size * arg.dat->size, set_size, + arg.dat->dim); + } else { + hipLaunchKernelGGL(export_halo_gather, blocks, threads, 0, 0, + export_exec_list_d[arg.dat->set->index], arg.data_d, + exp_exec_list->size, arg.dat->size, arg.dat->buffer_d); + + int blocks2 = 1 + ((exp_nonexec_list->size - 1) / 192); + hipLaunchKernelGGL(export_halo_gather, blocks2, threads, 0, 0, + export_nonexec_list_d[arg.dat->set->index], arg.data_d, + exp_nonexec_list->size, arg.dat->size, + arg.dat->buffer_d + exp_exec_list->size * arg.dat->size); + } +} + +void gather_data_to_buffer_partial(op_arg arg, halo_list exp_nonexec_list) { + int threads = 192; + int blocks = 1 + ((exp_nonexec_list->size - 1) / 192); + + if (strstr(arg.dat->type, ":soa") != NULL || + (OP_auto_soa && arg.dat->dim > 1)) { + + int set_size = arg.dat->set->size + arg.dat->set->exec_size + + arg.dat->set->nonexec_size; + + hipLaunchKernelGGL(export_halo_gather_soa, blocks, threads, 0, 0, + export_nonexec_list_partial_d[arg.map->index], arg.data_d, + exp_nonexec_list->size, arg.dat->size, arg.dat->buffer_d, set_size, + arg.dat->dim); + } else { + hipLaunchKernelGGL(export_halo_gather, blocks, threads, 0, 0, + export_nonexec_list_partial_d[arg.map->index], arg.data_d, + exp_nonexec_list->size, arg.dat->size, arg.dat->buffer_d); + } +} + +void scatter_data_from_buffer(op_arg arg) { + int threads = 192; + int blocks = 1 + ((arg.dat->set->exec_size - 1) / 192); + + if (strstr(arg.dat->type, 
":soa") != NULL || + (OP_auto_soa && arg.dat->dim > 1)) { + + int set_size = arg.dat->set->size + arg.dat->set->exec_size + + arg.dat->set->nonexec_size; + int offset = arg.dat->set->size; + int copy_size = arg.dat->set->exec_size; + + hipLaunchKernelGGL(import_halo_scatter_soa, blocks, threads, 0, 0, + offset, arg.data_d, copy_size, arg.dat->size, arg.dat->buffer_d_r, + set_size, arg.dat->dim); + + offset += arg.dat->set->exec_size; + copy_size = arg.dat->set->nonexec_size; + + int blocks2 = 1 + ((arg.dat->set->nonexec_size - 1) / 192); + hipLaunchKernelGGL(import_halo_scatter_soa, blocks2, threads, 0, 0, + offset, arg.data_d, copy_size, arg.dat->size, + arg.dat->buffer_d_r + arg.dat->set->exec_size * arg.dat->size, set_size, + arg.dat->dim); + } +} + +void scatter_data_from_buffer_partial(op_arg arg) { + int threads = 192; + int blocks = 1 + ((OP_import_nonexec_permap[arg.map->index]->size - 1) / 192); + + if (strstr(arg.dat->type, ":soa") != NULL || + (OP_auto_soa && arg.dat->dim > 1)) { + + int set_size = arg.dat->set->size + arg.dat->set->exec_size + + arg.dat->set->nonexec_size; + int init = OP_export_nonexec_permap[arg.map->index]->size; + int copy_size = OP_import_nonexec_permap[arg.map->index]->size; + + hipLaunchKernelGGL(import_halo_scatter_partial_soa, blocks, threads, 0, 0, + import_nonexec_list_partial_d[arg.map->index], arg.data_d, copy_size, + arg.dat->size, arg.dat->buffer_d + init * arg.dat->size, set_size, + arg.dat->dim); + } else { + int init = OP_export_nonexec_permap[arg.map->index]->size; + int copy_size = OP_import_nonexec_permap[arg.map->index]->size; + + hipLaunchKernelGGL(import_halo_scatter_partial, blocks, threads, 0, 0, + import_nonexec_list_partial_d[arg.map->index], arg.data_d, copy_size, + arg.dat->size, arg.dat->buffer_d + init * arg.dat->size, arg.dat->dim); + } +} + +__device__ int lower_bound(int *disps, int count, int value) { + int *it; + int *first = disps; + int step; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (*it < value) { + first = ++it; + count -= step + 1; + } + else + count = step; + } + return first-disps; +} + +__global__ void gather_data_to_buffer_ptr_cuda_kernel(const char *__restrict data, char *__restrict buffer, int *elem_list, int *disps, + unsigned *neigh_to_neigh_offsets, int rank_size, int soa, int type_size, int dim, int set_size) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id >= disps[rank_size]) return; + int neighbour = lower_bound(disps, rank_size, id); + if (disps[neighbour]!=id) neighbour--; + unsigned buf_pos = neigh_to_neigh_offsets[neighbour]; + unsigned set_elem_index = elem_list[id]; + if (soa) { + for (int d = 0; d < dim; d++) + if (type_size == 8 && (buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size)%8==0) + *(double*)&buffer[buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size] = *(double*)&data[(d*set_size + set_elem_index)*type_size]; + else + for (int p = 0; p < type_size; p++) + buffer[buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size + p] = data[(d*set_size + set_elem_index)*type_size + p]; + + } else { + int dat_size = type_size * dim; + if (type_size == 8 && (buf_pos + (id - disps[neighbour]) * dat_size)%8==0) + for (int d = 0; d < dim; d++) + *(double*)&buffer[buf_pos + (id - disps[neighbour]) * dat_size + d*type_size] = *(double*)&data[set_elem_index*dat_size + d*type_size]; + else + for (int p = 0; p < dat_size; p++) + buffer[buf_pos + (id - disps[neighbour]) * dat_size + p] = data[set_elem_index*dat_size + p]; 
+  }
+}
+
+__global__ void scatter_data_from_buffer_ptr_cuda_kernel(char * __restrict data, const char * __restrict buffer, int *disps,
+                unsigned *neigh_to_neigh_offsets, int rank_size, int soa, int type_size, int dim, int set_size) {
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (id >= disps[rank_size]) return;
+  int neighbour = lower_bound(disps, rank_size, id);
+  if (disps[neighbour]!=id) neighbour--;
+  unsigned buf_pos = neigh_to_neigh_offsets[neighbour];
+  if (soa) {
+    for (int d = 0; d < dim; d++)
+      if (type_size == 8 && (buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size)%8==0)
+        *(double*)&data[(d*set_size + id)*type_size] = *(double*)&buffer[buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size];
+      else
+        for (int p = 0; p < type_size; p++)
+          data[(d*set_size + id)*type_size + p] = buffer[buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size + p];
+  } else {
+    int dat_size = type_size * dim;
+    // if (*(double*)&buffer[buf_pos + (id - disps[neighbour]) * dat_size] != *(double*)&data[id*dat_size])
+    //   printf("Mismatch\n");
+    if (type_size == 8 && (buf_pos + (id - disps[neighbour]) * dat_size)%8==0)
+      for (int d = 0; d < dim; d++)
+        *(double*)&data[id*dat_size + d*type_size] = *(double*)&buffer[buf_pos + (id - disps[neighbour]) * dat_size + d*type_size];
+    else
+      for (int p = 0; p < dat_size; p++)
+        data[id*dat_size + p] = buffer[buf_pos + (id - disps[neighbour]) * dat_size + p];
+  }
+}
+
+
+unsigned *op2_grp_neigh_to_neigh_offsets_h = NULL;
+unsigned *op2_grp_neigh_to_neigh_offsets_d = NULL;
+int op2_grp_max_gathers = 10;
+extern int op2_grp_counter;
+int op2_grp_max_neighbours = 0;
+
+void check_realloc_buffer() {
+  //Figure out how much space we may need at most
+  if (op2_grp_neigh_to_neigh_offsets_h == NULL) {
+    for (int i = 0; i < OP_set_index; i++) {
+      op2_grp_max_neighbours = MAX(op2_grp_max_neighbours,OP_export_exec_list[i]->ranks_size);
+      op2_grp_max_neighbours = MAX(op2_grp_max_neighbours,OP_export_nonexec_list[i]->ranks_size);
+      op2_grp_max_neighbours = MAX(op2_grp_max_neighbours,OP_import_exec_list[i]->ranks_size);
+      op2_grp_max_neighbours = MAX(op2_grp_max_neighbours,OP_import_nonexec_list[i]->ranks_size);
+    }
+    //Need host buffers for each dat in flight
+    cutilSafeCall(hipHostMalloc(&op2_grp_neigh_to_neigh_offsets_h, op2_grp_max_gathers * op2_grp_max_neighbours * sizeof(unsigned)));
+    //But just one device buffer if gather kernels are sequential
+    cutilSafeCall(hipMalloc(&op2_grp_neigh_to_neigh_offsets_d, op2_grp_max_neighbours * sizeof(unsigned)));
+  }
+  if (op2_grp_counter >= op2_grp_max_gathers) {
+    cutilSafeCall(hipDeviceSynchronize());
+    cutilSafeCall(hipHostFree(op2_grp_neigh_to_neigh_offsets_h));
+    op2_grp_max_gathers *= 2;
+    cutilSafeCall(hipHostMalloc(&op2_grp_neigh_to_neigh_offsets_h, op2_grp_max_gathers * op2_grp_max_neighbours * sizeof(unsigned)));
+  }
+}
+
+void gather_data_to_buffer_ptr_cuda(op_arg arg, halo_list eel, halo_list enl, char *buffer,
+                                    std::vector<int>& neigh_list, std::vector<unsigned>& neigh_offsets) {
+
+  check_realloc_buffer();
+
+  int soa = 0;
+  if ((OP_auto_soa && arg.dat->dim > 1) || strstr(arg.dat->type, ":soa") != NULL) soa = 1;
+
+  //Exec halo
+
+  //Fill op2_grp_neigh_to_neigh_offsets_h at the appropriate position
+  for (int i = 0; i < eel->ranks_size; i++) {
+    int dest_rank = eel->ranks[i];
+    int buf_rankpos = std::distance(neigh_list.begin(),std::lower_bound(neigh_list.begin(), neigh_list.end(), dest_rank));
+    op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours+i] = neigh_offsets[buf_rankpos];
+    neigh_offsets[buf_rankpos] += eel->sizes[i] * arg.dat->size;
+  }
+  //Async upload
+  hipMemcpyAsync(op2_grp_neigh_to_neigh_offsets_d,&op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours],eel->ranks_size * sizeof(unsigned),hipMemcpyHostToDevice);
+  //Launch kernel
+  gather_data_to_buffer_ptr_cuda_kernel<<<1 + ((eel->size - 1) / 192),192>>>(arg.dat->data_d, buffer, export_exec_list_d[arg.dat->set->index], export_exec_list_disps_d[arg.dat->set->index],
+    op2_grp_neigh_to_neigh_offsets_d, eel->ranks_size, soa, arg.dat->size/arg.dat->dim, arg.dat->dim, arg.dat->set->size+arg.dat->set->exec_size+arg.dat->set->nonexec_size);
+  op2_grp_counter++;
+
+  //Same for nonexec
+
+  //Fill op2_grp_neigh_to_neigh_offsets_h at the appropriate position
+  for (int i = 0; i < enl->ranks_size; i++) {
+    int dest_rank = enl->ranks[i];
+    int buf_rankpos = std::distance(neigh_list.begin(),std::lower_bound(neigh_list.begin(), neigh_list.end(), dest_rank));
+    op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours+i] = neigh_offsets[buf_rankpos];
+    neigh_offsets[buf_rankpos] += enl->sizes[i] * arg.dat->size;
+  }
+  //Async upload
+  hipMemcpyAsync(op2_grp_neigh_to_neigh_offsets_d,&op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours],enl->ranks_size * sizeof(unsigned),hipMemcpyHostToDevice);
+  //Launch kernel
+  gather_data_to_buffer_ptr_cuda_kernel<<<1 + ((enl->size - 1) / 192),192>>>(arg.dat->data_d, buffer, export_nonexec_list_d[arg.dat->set->index], export_nonexec_list_disps_d[arg.dat->set->index],
+    op2_grp_neigh_to_neigh_offsets_d, enl->ranks_size, soa, arg.dat->size/arg.dat->dim, arg.dat->dim, arg.dat->set->size+arg.dat->set->exec_size+arg.dat->set->nonexec_size);
+
+  op2_grp_counter++;
+
+}
+
+void scatter_data_from_buffer_ptr_cuda(op_arg arg, halo_list iel, halo_list inl, char *buffer,
+                                       std::vector<int>& neigh_list, std::vector<unsigned>& neigh_offsets) {
+
+  check_realloc_buffer();
+
+  int soa = 0;
+  if ((OP_auto_soa && arg.dat->dim > 1) || strstr(arg.dat->type, ":soa") != NULL) soa = 1;
+
+  //Exec halo
+
+  //Fill op2_grp_neigh_to_neigh_offsets_h at the appropriate position
+  for (int i = 0; i < iel->ranks_size; i++) {
+    int dest_rank = iel->ranks[i];
+    int buf_rankpos = std::distance(neigh_list.begin(),std::lower_bound(neigh_list.begin(), neigh_list.end(), dest_rank));
+    op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours+i] = neigh_offsets[buf_rankpos];
+    neigh_offsets[buf_rankpos] += iel->sizes[i] * arg.dat->size;
+  }
+  //Async upload
+  hipMemcpyAsync(op2_grp_neigh_to_neigh_offsets_d,&op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours],iel->ranks_size * sizeof(unsigned),hipMemcpyHostToDevice,op2_grp_secondary);
+  //Launch kernel
+  unsigned offset = arg.dat->set->size * (soa?arg.dat->size/arg.dat->dim:arg.dat->size);
+  scatter_data_from_buffer_ptr_cuda_kernel<<<1 + ((iel->size - 1) / 192),192,0,op2_grp_secondary>>>(arg.dat->data_d+offset, buffer, import_exec_list_disps_d[arg.dat->set->index],
+    op2_grp_neigh_to_neigh_offsets_d, iel->ranks_size, soa, arg.dat->size/arg.dat->dim, arg.dat->dim, arg.dat->set->size+arg.dat->set->exec_size+arg.dat->set->nonexec_size);
+  op2_grp_counter++;
+
+  //Same for nonexec
+
+  //Fill op2_grp_neigh_to_neigh_offsets_h at the appropriate position
+  for (int i = 0; i < inl->ranks_size; i++) {
+    int dest_rank = inl->ranks[i];
+    int buf_rankpos = std::distance(neigh_list.begin(),std::lower_bound(neigh_list.begin(), neigh_list.end(), dest_rank));
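+    // buf_rankpos: position of dest_rank in the sorted neighbour list, i.e.
+    // which per-rank segment of the grouped buffer this scatter reads from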
op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours+i] = neigh_offsets[buf_rankpos]; + neigh_offsets[buf_rankpos] += inl->sizes[i] * arg.dat->size; + } + //Async upload + hipMemcpyAsync(op2_grp_neigh_to_neigh_offsets_d,&op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours],inl->ranks_size * sizeof(unsigned),hipMemcpyHostToDevice,op2_grp_secondary); + //Launch kernel + offset = (arg.dat->set->size + iel->size) * (soa?arg.dat->size/arg.dat->dim:arg.dat->size); + scatter_data_from_buffer_ptr_cuda_kernel<<<1 + ((inl->size - 1) / 192),192,0,op2_grp_secondary>>>(arg.dat->data_d+offset, buffer, import_nonexec_list_disps_d[arg.dat->set->index], + op2_grp_neigh_to_neigh_offsets_d, inl->ranks_size, soa, arg.dat->size/arg.dat->dim, arg.dat->dim, arg.dat->set->size+arg.dat->set->exec_size+arg.dat->set->nonexec_size); + + op2_grp_counter++; + +} diff --git a/op2/src/mpi/op_mpi_hip_rt_support.cpp b/op2/src/mpi/op_mpi_hip_rt_support.cpp new file mode 100644 index 000000000..655cb25c2 --- /dev/null +++ b/op2/src/mpi/op_mpi_hip_rt_support.cpp @@ -0,0 +1,763 @@ +/* + * Open source copyright declaration based on BSD open source template: + * http://www.opensource.org/licenses/bsd-license.php + * + * This file is part of the OP2 distribution. + * + * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in + * the main source directory for a full list of copyright holders. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * The name of Mike Giles may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// +// This file implements the MPI+HIP-specific run-time support functions +// + +// +// header files +// + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +// Small re-declaration to avoid using struct in the C version. 
+
+// This is due to the different way in which C and C++ see structs
+
+typedef struct hipDeviceProp_t cudaDeviceProp_t;
+
+//
+// export lists on the device
+//
+
+int **export_exec_list_d = NULL;
+int **export_nonexec_list_d = NULL;
+int **export_exec_list_disps_d = NULL;
+int **export_nonexec_list_disps_d = NULL;
+int **export_nonexec_list_partial_d = NULL;
+int **import_nonexec_list_partial_d = NULL;
+int **import_exec_list_disps_d = NULL;
+int **import_nonexec_list_disps_d = NULL;
+
+hipEvent_t op2_grp_download_event;
+hipStream_t op2_grp_secondary;
+
+void cutilDeviceInit(int argc, char **argv) {
+  (void)argc;
+  (void)argv;
+  int deviceCount;
+  cutilSafeCall(hipGetDeviceCount(&deviceCount));
+  if (deviceCount == 0) {
+    printf("cutil error: no devices supporting HIP\n");
+    exit(-1);
+  }
+  printf("Trying to select a device\n");
+
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  // no need to hardcode the following, it can be done via numawrap scripts
+  /*if (getenv("OMPI_COMM_WORLD_LOCAL_RANK")!=NULL) {
+    rank = atoi(getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
+  } else if (getenv("MV2_COMM_WORLD_LOCAL_RANK")!=NULL) {
+    rank = atoi(getenv("MV2_COMM_WORLD_LOCAL_RANK"));
+  } else if (getenv("MPI_LOCALRANKID")!=NULL) {
+    rank = atoi(getenv("MPI_LOCALRANKID"));
+  } else {
+    rank = rank%deviceCount;
+  }*/
+
+  // Test we have access to a device
+
+  // This commented out test does not work with CUDA versions above 6.5
+  /*float *test;
+  hipError_t err = hipMalloc((void **)&test, sizeof(float));
+  if (err != hipSuccess) {
+    OP_hybrid_gpu = 0;
+  } else {
+    OP_hybrid_gpu = 1;
+  }
+  if (OP_hybrid_gpu) {
+    hipFree(test);
+
+    cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferL1));
+
+    int deviceId = -1;
+    hipGetDevice(&deviceId);
+    cudaDeviceProp_t deviceProp;
+    cutilSafeCall ( hipGetDeviceProperties ( &deviceProp, deviceId ) );
+    printf ( "\n Using HIP device: %d %s on rank %d\n",deviceId,
+             deviceProp.name,rank );
+  } else {
+    printf ( "\n Using CPU on rank %d\n",rank );
+  }*/
+  //omp_set_default_device(rank);
+//  hipError_t err = hipSetDevice(rank);
+  float *test;
+  OP_hybrid_gpu = 0;
+  //hipError_t err = hipMalloc((void **)&test, sizeof(float));
+  for (int i = 0; i < deviceCount; i++) {
+    hipError_t err = hipSetDevice((i+rank)%deviceCount);
+    if (err == hipSuccess) {
+      hipError_t err = hipMalloc((void **)&test, sizeof(float));
+      if (err == hipSuccess) {
+        OP_hybrid_gpu = 1;
+        break;
+      }
+    }
+  }
+  if (OP_hybrid_gpu) {
+    cutilSafeCall(hipFree(test));
+
+    cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferL1));
+
+    int deviceId = -1;
+    hipGetDevice(&deviceId);
+    cudaDeviceProp_t deviceProp;
+    cutilSafeCall(hipGetDeviceProperties(&deviceProp, deviceId));
+    printf("\n Using HIP device: %d %s on rank %d\n", deviceId,
+           deviceProp.name, rank);
+    cutilSafeCall(hipStreamCreateWithFlags(&op2_grp_secondary, hipStreamNonBlocking));
+    cutilSafeCall(hipEventCreateWithFlags(&op2_grp_download_event, hipEventDisableTiming));
+  } else {
+    printf("\n Using CPU on rank %d\n", rank);
+  }
+}
+
+void op_upload_dat(op_dat dat) {
+  if (OP_import_exec_list==NULL) return;
+  // printf("Uploading %s\n", dat->name);
+  int set_size = dat->set->size + OP_import_exec_list[dat->set->index]->size +
+                 OP_import_nonexec_list[dat->set->index]->size;
+  if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+    char *temp_data = (char *)xmalloc(dat->size * set_size * sizeof(char));
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          temp_data[element_size * i * set_size + element_size * j + c] =
+              dat->data[dat->size * j + element_size * i + c];
+        }
+      }
+    }
+    cutilSafeCall(hipMemcpy(dat->data_d, temp_data, set_size * dat->size,
+                            hipMemcpyHostToDevice));
+    free(temp_data);
+  } else {
+    cutilSafeCall(hipMemcpy(dat->data_d, dat->data, set_size * dat->size,
+                            hipMemcpyHostToDevice));
+  }
+}
+
+void op_download_dat(op_dat dat) {
+  //Check if partitioning is done
+  if (OP_import_exec_list==NULL) return;
+  // printf("Downloading %s\n", dat->name);
+  int set_size = dat->set->size + OP_import_exec_list[dat->set->index]->size +
+                 OP_import_nonexec_list[dat->set->index]->size;
+  if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+    char *temp_data = (char *)xmalloc(dat->size * set_size * sizeof(char));
+    cutilSafeCall(hipMemcpy(temp_data, dat->data_d, set_size * dat->size,
+                            hipMemcpyDeviceToHost));
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          dat->data[dat->size * j + element_size * i + c] =
+              temp_data[element_size * i * set_size + element_size * j + c];
+        }
+      }
+    }
+    free(temp_data);
+  } else {
+    cutilSafeCall(hipMemcpy(dat->data, dat->data_d, set_size * dat->size,
+                            hipMemcpyDeviceToHost));
+  }
+}
+
+void op_exchange_halo_cuda(op_arg *arg, int exec_flag) {
+  op_dat dat = arg->dat;
+
+  if (arg->sent == 1) {
+    printf("Error: Halo exchange already in flight for dat %s\n", dat->name);
+    fflush(stdout);
+    MPI_Abort(OP_MPI_WORLD, 2);
+  }
+
+  // For a directly accessed op_dat do not do halo exchanges if not executing
+  // over redundant compute block
+  if (exec_flag == 0 && arg->idx == -1)
+    return;
+
+  arg->sent = 0; // reset flag
+  // need to exchange both direct and indirect data sets if they are dirty
+  if ((arg->opt) &&
+      (arg->acc == OP_READ ||
+       arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) &&
+      (dat->dirtybit == 1)) {
+
+    halo_list imp_exec_list = OP_import_exec_list[dat->set->index];
+    halo_list imp_nonexec_list = OP_import_nonexec_list[dat->set->index];
+
+    halo_list exp_exec_list = OP_export_exec_list[dat->set->index];
+    halo_list exp_nonexec_list = OP_export_nonexec_list[dat->set->index];
+
+    //-------first exchange exec elements related to this data array--------
+
+    // sanity checks
+    if (compare_sets(imp_exec_list->set, dat->set) == 0) {
+      printf("Error: Import list and set mismatch\n");
+      MPI_Abort(OP_MPI_WORLD, 2);
+    }
+    if (compare_sets(exp_exec_list->set, dat->set) == 0) {
+      printf("Error: Export list and set mismatch\n");
+      MPI_Abort(OP_MPI_WORLD, 2);
+    }
+
+    gather_data_to_buffer(*arg, exp_exec_list, exp_nonexec_list);
+
+    char *outptr_exec = NULL;
+    char *outptr_nonexec = NULL;
+    if (OP_gpu_direct) {
+      outptr_exec = arg->dat->buffer_d;
+      outptr_nonexec =
+          arg->dat->buffer_d + exp_exec_list->size * arg->dat->size;
+      cutilSafeCall(hipDeviceSynchronize());
+    } else {
+      cutilSafeCall(hipMemcpy(
+          ((op_mpi_buffer)(dat->mpi_buffer))->buf_exec, arg->dat->buffer_d,
+          exp_exec_list->size * arg->dat->size, hipMemcpyDeviceToHost));
+
+      cutilSafeCall(hipMemcpy(
+          ((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec,
+          arg->dat->buffer_d + exp_exec_list->size * arg->dat->size,
+          exp_nonexec_list->size * arg->dat->size, hipMemcpyDeviceToHost));
+
+      cutilSafeCall(hipDeviceSynchronize());
+      outptr_exec = ((op_mpi_buffer)(dat->mpi_buffer))->buf_exec;
+      outptr_nonexec = ((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec;
+    }
+
+    for
(int i = 0; i < exp_exec_list->ranks_size; i++) { + MPI_Isend(&outptr_exec[exp_exec_list->disps[i] * dat->size], + dat->size * exp_exec_list->sizes[i], MPI_CHAR, + exp_exec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]); + } + + int init = dat->set->size * dat->size; + char *ptr = NULL; + for (int i = 0; i < imp_exec_list->ranks_size; i++) { + ptr = OP_gpu_direct + ? &(dat->data_d[init + imp_exec_list->disps[i] * dat->size]) + : &(dat->data[init + imp_exec_list->disps[i] * dat->size]); + if (OP_gpu_direct && (strstr(arg->dat->type, ":soa") != NULL || + (OP_auto_soa && arg->dat->dim > 1))) + ptr = dat->buffer_d_r + imp_exec_list->disps[i] * dat->size; + MPI_Irecv(ptr, dat->size * imp_exec_list->sizes[i], MPI_CHAR, + imp_exec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]); + } + + //-----second exchange nonexec elements related to this data array------ + // sanity checks + if (compare_sets(imp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Import list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Export list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + for (int i = 0; i < exp_nonexec_list->ranks_size; i++) { + MPI_Isend(&outptr_nonexec[exp_nonexec_list->disps[i] * dat->size], + dat->size * exp_nonexec_list->sizes[i], MPI_CHAR, + exp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]); + } + + int nonexec_init = (dat->set->size + imp_exec_list->size) * dat->size; + for (int i = 0; i < imp_nonexec_list->ranks_size; i++) { + ptr = OP_gpu_direct + ? 
&(dat->data_d[nonexec_init + + imp_nonexec_list->disps[i] * dat->size]) + : &(dat->data[nonexec_init + + imp_nonexec_list->disps[i] * dat->size]); + if (OP_gpu_direct && (strstr(arg->dat->type, ":soa") != NULL || + (OP_auto_soa && arg->dat->dim > 1))) + ptr = dat->buffer_d_r + + (imp_exec_list->size + imp_exec_list->disps[i]) * dat->size; + MPI_Irecv(ptr, dat->size * imp_nonexec_list->sizes[i], MPI_CHAR, + imp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]); + } + + // clear dirty bit + dat->dirtybit = 0; + arg->sent = 1; + } +} + +void op_exchange_halo_partial_cuda(op_arg *arg, int exec_flag) { + op_dat dat = arg->dat; + + if (arg->sent == 1) { + printf("Error: Halo exchange already in flight for dat %s\n", dat->name); + fflush(stdout); + MPI_Abort(OP_MPI_WORLD, 2); + } + + // For a directly accessed op_dat do not do halo exchanges if not executing + // over + // redundant compute block + if (exec_flag == 0 && arg->idx == -1) + return; + + arg->sent = 0; // reset flag + // need to exchange both direct and indirect data sets if they are dirty + if ((arg->opt) && + (arg->acc == OP_READ || + arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) && + (dat->dirtybit == 1)) { + + halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index]; + halo_list exp_nonexec_list = OP_export_nonexec_permap[arg->map->index]; + + //-------first exchange exec elements related to this data array-------- + + // sanity checks + if (compare_sets(imp_nonexec_list->set, dat->set) == 0) { + printf("Error: Import list and set mismatch\n"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_nonexec_list->set, dat->set) == 0) { + printf("Error: Export list and set mismatch\n"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + gather_data_to_buffer_partial(*arg, exp_nonexec_list); + + char *outptr_nonexec = NULL; + if (OP_gpu_direct) { + outptr_nonexec = arg->dat->buffer_d; + cutilSafeCall(hipDeviceSynchronize()); + } else { + cutilSafeCall(hipMemcpy( + ((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec, arg->dat->buffer_d, + exp_nonexec_list->size * arg->dat->size, hipMemcpyDeviceToHost)); + + cutilSafeCall(hipDeviceSynchronize()); + outptr_nonexec = ((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec; + } + + //-----second exchange nonexec elements related to this data array------ + // sanity checks + if (compare_sets(imp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Import list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Export list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + for (int i = 0; i < exp_nonexec_list->ranks_size; i++) { + MPI_Isend(&outptr_nonexec[exp_nonexec_list->disps[i] * dat->size], + dat->size * exp_nonexec_list->sizes[i], MPI_CHAR, + exp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]); + } + + int nonexec_init = OP_export_nonexec_permap[arg->map->index]->size; + for (int i = 0; i < imp_nonexec_list->ranks_size; i++) { + char *ptr = + OP_gpu_direct + ? 
&arg->dat + ->buffer_d[(nonexec_init + imp_nonexec_list->disps[i]) * + dat->size] + : &((op_mpi_buffer)(dat->mpi_buffer)) + ->buf_nonexec[(nonexec_init + imp_nonexec_list->disps[i]) * + dat->size]; + MPI_Irecv(ptr, dat->size * imp_nonexec_list->sizes[i], MPI_CHAR, + imp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]); + } + + arg->sent = 1; + } +} + +void op_exchange_halo(op_arg *arg, int exec_flag) { + op_dat dat = arg->dat; + + if (exec_flag == 0 && arg->idx == -1) + return; + if (arg->opt == 0) + return; + + if (arg->sent == 1) { + printf("Error: Halo exchange already in flight for dat %s\n", dat->name); + fflush(stdout); + MPI_Abort(OP_MPI_WORLD, 2); + } + + // need to exchange both direct and indirect data sets if they are dirty + if ((arg->acc == OP_READ || + arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) && + (dat->dirtybit == 1)) { + // printf("Exchanging Halo of data array %10s\n",dat->name); + halo_list imp_exec_list = OP_import_exec_list[dat->set->index]; + halo_list imp_nonexec_list = OP_import_nonexec_list[dat->set->index]; + + halo_list exp_exec_list = OP_export_exec_list[dat->set->index]; + halo_list exp_nonexec_list = OP_export_nonexec_list[dat->set->index]; + + //-------first exchange exec elements related to this data array-------- + + // sanity checks + if (compare_sets(imp_exec_list->set, dat->set) == 0) { + printf("Error: Import list and set mismatch\n"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_exec_list->set, dat->set) == 0) { + printf("Error: Export list and set mismatch\n"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + int set_elem_index; + for (int i = 0; i < exp_exec_list->ranks_size; i++) { + for (int j = 0; j < exp_exec_list->sizes[i]; j++) { + set_elem_index = exp_exec_list->list[exp_exec_list->disps[i] + j]; + memcpy(&((op_mpi_buffer)(dat->mpi_buffer)) + ->buf_exec[exp_exec_list->disps[i] * dat->size + + j * dat->size], + (void *)&dat->data[dat->size * (set_elem_index)], dat->size); + } + MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer)) + ->buf_exec[exp_exec_list->disps[i] * dat->size], + dat->size * exp_exec_list->sizes[i], MPI_CHAR, + exp_exec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]); + } + + int init = dat->set->size * dat->size; + for (int i = 0; i < imp_exec_list->ranks_size; i++) { + MPI_Irecv(&(dat->data[init + imp_exec_list->disps[i] * dat->size]), + dat->size * imp_exec_list->sizes[i], MPI_CHAR, + imp_exec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]); + } + + //-----second exchange nonexec elements related to this data array------ + // sanity checks + if (compare_sets(imp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Import list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Export list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + for (int i = 0; i < exp_nonexec_list->ranks_size; i++) { + for (int j = 0; j < exp_nonexec_list->sizes[i]; j++) { + set_elem_index = exp_nonexec_list->list[exp_nonexec_list->disps[i] + j]; + memcpy(&((op_mpi_buffer)(dat->mpi_buffer)) + ->buf_nonexec[exp_nonexec_list->disps[i] * dat->size + + j * dat->size], + (void *)&dat->data[dat->size * (set_elem_index)], dat->size); + } + 
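+      // all of rank i's nonexec elements are now packed contiguously in
+      // buf_nonexec; post the non-blocking send for that segment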
MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer))
+                     ->buf_nonexec[exp_nonexec_list->disps[i] * dat->size],
+                dat->size * exp_nonexec_list->sizes[i], MPI_CHAR,
+                exp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD,
+                &((op_mpi_buffer)(dat->mpi_buffer))
+                     ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]);
+    }
+
+    int nonexec_init = (dat->set->size + imp_exec_list->size) * dat->size;
+    for (int i = 0; i < imp_nonexec_list->ranks_size; i++) {
+      MPI_Irecv(
+          &(dat->data[nonexec_init + imp_nonexec_list->disps[i] * dat->size]),
+          dat->size * imp_nonexec_list->sizes[i], MPI_CHAR,
+          imp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD,
+          &((op_mpi_buffer)(dat->mpi_buffer))
+               ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]);
+    }
+    // clear dirty bit
+    dat->dirtybit = 0;
+    arg->sent = 1;
+  }
+}
+
+void op_exchange_halo_partial(op_arg *arg, int exec_flag) {
+  op_dat dat = arg->dat;
+
+  if (arg->opt == 0)
+    return;
+
+  if (arg->sent == 1) {
+    printf("Error: Halo exchange already in flight for dat %s\n", dat->name);
+    fflush(stdout);
+    MPI_Abort(OP_MPI_WORLD, 2);
+  }
+  arg->sent = 0; // reset flag
+
+  // need to exchange indirect data sets if they are dirty
+  if ((arg->acc == OP_READ ||
+       arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) &&
+      (dat->dirtybit == 1)) {
+    int rank;
+    MPI_Comm_rank(OP_MPI_WORLD, &rank);
+    halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index];
+    halo_list exp_nonexec_list = OP_export_nonexec_permap[arg->map->index];
+    //-------exchange nonexec elements related to this data array and
+    // map--------
+
+    // sanity checks
+    if (compare_sets(imp_nonexec_list->set, dat->set) == 0) {
+      printf("Error: Import list and set mismatch\n");
+      MPI_Abort(OP_MPI_WORLD, 2);
+    }
+    if (compare_sets(exp_nonexec_list->set, dat->set) == 0) {
+      printf("Error: Export list and set mismatch\n");
+      MPI_Abort(OP_MPI_WORLD, 2);
+    }
+
+    int set_elem_index;
+    for (int i = 0; i < exp_nonexec_list->ranks_size; i++) {
+      for (int j = 0; j < exp_nonexec_list->sizes[i]; j++) {
+        set_elem_index = exp_nonexec_list->list[exp_nonexec_list->disps[i] + j];
+        memcpy(&((op_mpi_buffer)(dat->mpi_buffer))
+                    ->buf_nonexec[exp_nonexec_list->disps[i] * dat->size +
+                                  j * dat->size],
+               (void *)&dat->data[dat->size * (set_elem_index)], dat->size);
+      }
+      MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer))
+                     ->buf_nonexec[exp_nonexec_list->disps[i] * dat->size],
+                dat->size * exp_nonexec_list->sizes[i], MPI_CHAR,
+                exp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD,
+                &((op_mpi_buffer)(dat->mpi_buffer))
+                     ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]);
+    }
+
+    int init = exp_nonexec_list->size;
+    for (int i = 0; i < imp_nonexec_list->ranks_size; i++) {
+      MPI_Irecv(
+          &((op_mpi_buffer)(dat->mpi_buffer))
+               ->buf_nonexec[(init + imp_nonexec_list->disps[i]) * dat->size],
+          dat->size * imp_nonexec_list->sizes[i], MPI_CHAR,
+          imp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD,
+          &((op_mpi_buffer)(dat->mpi_buffer))
+               ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]);
+    }
+    // note that we are not setting the dirtybit to 0, since it's not a full
+    // exchange
+    arg->sent = 1;
+  }
+}
+
+void op_wait_all_cuda(op_arg *arg) {
+  if (arg->opt && arg->argtype == OP_ARG_DAT && arg->sent == 1) {
+    op_dat dat = arg->dat;
+    MPI_Waitall(((op_mpi_buffer)(dat->mpi_buffer))->s_num_req,
+                ((op_mpi_buffer)(dat->mpi_buffer))->s_req, MPI_STATUSES_IGNORE);
+    MPI_Waitall(((op_mpi_buffer)(dat->mpi_buffer))->r_num_req,
+                ((op_mpi_buffer)(dat->mpi_buffer))->r_req, MPI_STATUSES_IGNORE);
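+    // sends and receives have all completed; reset the request counters so
+    // the mpi_buffer can be reused by the next halo exchange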
+    ((op_mpi_buffer)(dat->mpi_buffer))->s_num_req = 0;
+    ((op_mpi_buffer)(dat->mpi_buffer))->r_num_req = 0;
+
+    if (arg->map != OP_ID && OP_map_partial_exchange[arg->map->index]) {
+      halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index];
+      int nonexec_init = OP_export_nonexec_permap[arg->map->index]->size;
+      if (OP_gpu_direct == 0)
+        cutilSafeCall(hipMemcpyAsync(
+            dat->buffer_d + nonexec_init * dat->size,
+            &((op_mpi_buffer)(dat->mpi_buffer))
+                 ->buf_nonexec[nonexec_init * dat->size],
+            imp_nonexec_list->size * dat->size, hipMemcpyHostToDevice, 0));
+      scatter_data_from_buffer_partial(*arg);
+    } else {
+      if (OP_gpu_direct == 0) {
+        if (strstr(arg->dat->type, ":soa") != NULL ||
+            (OP_auto_soa && arg->dat->dim > 1)) {
+          int init = dat->set->size * dat->size;
+          int size = (dat->set->exec_size + dat->set->nonexec_size) * dat->size;
+          cutilSafeCall(hipMemcpyAsync(dat->buffer_d_r, dat->data + init, size,
+                                       hipMemcpyHostToDevice, 0));
+          scatter_data_from_buffer(*arg);
+        } else {
+          int init = dat->set->size * dat->size;
+          cutilSafeCall(
+              hipMemcpyAsync(dat->data_d + init, dat->data + init,
+                             (OP_import_exec_list[dat->set->index]->size +
+                              OP_import_nonexec_list[dat->set->index]->size) *
+                                 arg->dat->size,
+                             hipMemcpyHostToDevice, 0));
+        }
+      } else if (strstr(arg->dat->type, ":soa") != NULL ||
+                 (OP_auto_soa && arg->dat->dim > 1))
+        scatter_data_from_buffer(*arg);
+    }
+    arg->sent = 2; // set flag to indicate completed comm
+  }
+}
+
+void op_wait_all(op_arg *arg) {
+  if (arg->opt && arg->argtype == OP_ARG_DAT && arg->sent == 1) {
+    op_dat dat = arg->dat;
+    MPI_Waitall(((op_mpi_buffer)(dat->mpi_buffer))->s_num_req,
+                ((op_mpi_buffer)(dat->mpi_buffer))->s_req, MPI_STATUSES_IGNORE);
+    MPI_Waitall(((op_mpi_buffer)(dat->mpi_buffer))->r_num_req,
+                ((op_mpi_buffer)(dat->mpi_buffer))->r_req, MPI_STATUSES_IGNORE);
+    ((op_mpi_buffer)(dat->mpi_buffer))->s_num_req = 0;
+    ((op_mpi_buffer)(dat->mpi_buffer))->r_num_req = 0;
+    if (arg->map != OP_ID && OP_map_partial_exchange[arg->map->index]) {
+      halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index];
+      int init = OP_export_nonexec_permap[arg->map->index]->size;
+      char *buffer =
+          &((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec[init * dat->size];
+      for (int i = 0; i < imp_nonexec_list->size; i++) {
+        int set_elem_index = imp_nonexec_list->list[i];
+        memcpy((void *)&dat->data[dat->size * (set_elem_index)],
+               &buffer[i * dat->size], dat->size);
+      }
+    }
+  }
+  arg->sent = 0;
+}
+
+void op_partition(const char *lib_name, const char *lib_routine,
+                  op_set prime_set, op_map prime_map, op_dat coords) {
+  partition(lib_name, lib_routine, prime_set, prime_map, coords);
+  if (!OP_hybrid_gpu)
+    return;
+  op_move_to_device();
+}
+
+void op_move_to_device() {
+  for (int s = 0; s < OP_set_index; s++) {
+    op_set set = OP_set_list[s];
+    op_dat_entry *item;
+    TAILQ_FOREACH(item, &OP_dat_list, entries) {
+      op_dat dat = item->dat;
+
+      if (dat->set->index == set->index)
+        op_mv_halo_device(set, dat);
+    }
+  }
+
+  for (int m = 0; m < OP_map_index; m++) {
+    // Upload maps in transposed form
+    op_map map = OP_map_list[m];
+    int set_size = map->from->size + map->from->exec_size;
+    int *temp_map = (int *)xmalloc(map->dim * set_size * sizeof(int));
+    for (int i = 0; i < map->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        temp_map[i * set_size + j] = map->map[map->dim * j + i];
+      }
+    }
+    op_cpHostToDevice((void **)&(map->map_d), (void **)&(temp_map),
+                      map->dim * set_size * sizeof(int));
+    free(temp_map);
+  }
+
+  op_mv_halo_list_device();
+}
+
+int op_is_root() {
+  int my_rank;
+  MPI_Comm_rank(OP_MPI_WORLD, &my_rank);
+  return (my_rank == MPI_ROOT);
+}
+
+int op2_grp_size_recv_old = 0;
+int op2_grp_size_send_old = 0;
+void op_realloc_comm_buffer(char **send_buffer_host, char **recv_buffer_host,
+      char **send_buffer_device, char **recv_buffer_device, int device,
+      unsigned size_send, unsigned size_recv) {
+  if (op2_grp_size_recv_old < size_recv) {
+    //if (*recv_buffer_host != NULL) cutilSafeCall(hipHostFree(*recv_buffer_host));
+    if (*recv_buffer_device != NULL) cutilSafeCall(hipFree(*recv_buffer_device));
+    cutilSafeCall(hipMalloc(recv_buffer_device, size_recv));
+    //cutilSafeCall(hipMallocHost(recv_buffer_host, size_send));
+    if (op2_grp_size_recv_old>0) cutilSafeCall(hipHostUnregister ( *recv_buffer_host ));
+    *recv_buffer_host = (char*)op_realloc(*recv_buffer_host, size_recv);
+    cutilSafeCall(hipHostRegister ( *recv_buffer_host, size_recv, hipHostRegisterDefault ));
+    op2_grp_size_recv_old = size_recv;
+  }
+  if (op2_grp_size_send_old < size_send) {
+    //if (*send_buffer_host != NULL) cutilSafeCall(hipHostFree(*send_buffer_host));
+    if (*send_buffer_device != NULL) cutilSafeCall(hipFree(*send_buffer_device));
+    cutilSafeCall(hipMalloc(send_buffer_device, size_send));
+    //cutilSafeCall(hipMallocHost(send_buffer_host, size_recv));
+    if (op2_grp_size_send_old>0) cutilSafeCall(hipHostUnregister ( *send_buffer_host ));
+    *send_buffer_host = (char*)op_realloc(*send_buffer_host, size_send);
+    cutilSafeCall(hipHostRegister ( *send_buffer_host, size_send, hipHostRegisterDefault ));
+    op2_grp_size_send_old = size_send;
+  }
+}
+
+void op_download_buffer_async(char *send_buffer_device, char *send_buffer_host, unsigned size_send) {
+  //Make sure gather kernels on the 0 stream finished before starting download
+  cutilSafeCall(hipEventRecord(op2_grp_download_event,0));
+  cutilSafeCall(hipStreamWaitEvent(op2_grp_secondary, op2_grp_download_event,0));
+  cutilSafeCall(hipMemcpyAsync(send_buffer_host, send_buffer_device, size_send, hipMemcpyDeviceToHost, op2_grp_secondary));
+}
+void op_upload_buffer_async (char *recv_buffer_device, char *recv_buffer_host, unsigned size_recv) {
+  cutilSafeCall(hipMemcpyAsync(recv_buffer_device, recv_buffer_host, size_recv, hipMemcpyHostToDevice, op2_grp_secondary));
+}
+
+void op_scatter_sync() {
+  cutilSafeCall(hipEventRecord(op2_grp_download_event, op2_grp_secondary));
+  cutilSafeCall(hipStreamWaitEvent(0, op2_grp_download_event,0));
+}
+void op_download_buffer_sync() {
+  cutilSafeCall(hipStreamSynchronize(op2_grp_secondary));
+}
diff --git a/translator/c/op2.py b/translator/c/op2.py
index 97a9997d2..ade0ccc85 100755
--- a/translator/c/op2.py
+++ b/translator/c/op2.py
@@ -933,6 +933,7 @@ def main(srcFilesAndDirs=sys.argv[1:]):
   #code generators for NVIDIA GPUs with CUDA
   #op2_gen_cuda(masterFile, date, consts, kernels,sets) # Optimized for Fermi GPUs
   op2_gen_cuda_simple(masterFile, date, consts, kernels, sets, macro_defs) # Optimized for Kepler GPUs
+  op2_gen_cuda_simple(masterFile, date, consts, kernels, sets, macro_defs, hip = 1) # HIP variant of the same generator
 
 # generates openmp code as well as cuda code into the same file
   op2_gen_cuda_simple_hyb(masterFile, date, consts, kernels, sets) # CPU and GPU will then do computations as a hybrid application
diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py
index 436f1537e..d3551aaaf 100644
--- a/translator/c/op2_gen_cuda_simple.py
+++ b/translator/c/op2_gen_cuda_simple.py
@@ -94,11 +94,15 @@ def ENDIF():
   elif CPP: code('}')
 
-def
diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py
index 436f1537e..d3551aaaf 100644
--- a/translator/c/op2_gen_cuda_simple.py
+++ b/translator/c/op2_gen_cuda_simple.py
@@ -94,11 +94,15 @@ def ENDIF():
   elif CPP:
     code('}')
 
-def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
+def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0):
 
   global dims, idxs, typs, indtyps, inddims
   global FORTRAN, CPP, g_m, file_text, depth
 
+  cuda = 'cuda'
+  if hip == 1:
+    cuda = 'hip'
+
   OP_ID = 1; OP_GBL = 2; OP_MAP = 3;
 
   OP_READ = 1; OP_WRITE = 2; OP_RW = 3;
@@ -748,10 +752,13 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
       if optflags[g_m]==1:
         IF('optflags & 1<<'+str(optidxs[g_m]))
       for d in range(0,int(dims[g_m])):
+        atomicAdd = 'atomicAdd'
+        if hip == 1:
+          atomicAdd = 'unsafeAtomicAdd'
        if soaflags[g_m]:
-          code('atomicAdd(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx],_l['+str(d)+']);')
+          code(atomicAdd+'(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx],_l['+str(d)+']);')
        else:
-          code('atomicAdd(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>],_l['+str(d)+']);')
+          code(atomicAdd+'(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>],_l['+str(d)+']);')
 
       if optflags[g_m]==1:
         ENDIF()
@@ -966,12 +973,12 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
           k = k + [mapnames[g_m]]
           IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
           code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
-          code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));')
+          code(cuda+'MemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));')
           ENDIF()
       if dir_soa!=-1:
         IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
         code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
-        code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));')
+        code(cuda+'MemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));')
         ENDIF()
 
 #
@@ -1241,7 +1248,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
 # update kernel record
 #
 
-    code('cutilSafeCall(cudaDeviceSynchronize());')
+    code('cutilSafeCall('+cuda+'DeviceSynchronize());')
     comm('update kernel record')
     code('op_timers_core(&cpu_t2, &wall_t2);')
     code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;')
@@ -1266,9 +1273,9 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
 ##########################################################################
 #  output individual kernel file
 ##########################################################################
-  if not os.path.exists('cuda'):
-    os.makedirs('cuda')
-  fid = open('cuda/'+name+'_kernel.cu','w')
+  if not os.path.exists(cuda):
+    os.makedirs(cuda)
+  fid = open(cuda+'/'+name+'_kernel.'+('cpp' if hip==1 else 'cu'),'w')
   date = datetime.datetime.now()
   fid.write('//\n// auto-generated by op2.py\n//\n\n')
   fid.write(file_text)
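Note: with hip = 1 the indirect-increment path above emits unsafeAtomicAdd rather than atomicAdd. On AMD GPUs unsafeAtomicAdd can compile to the native floating-point atomic instruction instead of a compare-and-swap loop, usually in combination with the -munsafe-fp-atomics compiler flag; the trade-off is weaker rounding and denormal guarantees. A rough, hand-written HIP illustration of the generated shape (ind_arg0, map0 and the increment value are placeholders, not actual generator output):

    #include <hip/hip_runtime.h>

    // Each thread accumulates a local value, then commits it with a
    // hardware FP atomic; several threads may target the same element.
    __global__ void inc_kernel(double *ind_arg0, const int *map0, int n) {
      int tid = blockIdx.x * blockDim.x + threadIdx.x;
      if (tid < n) {
        double _l = 1.0;                           // stand-in for the kernel's result
        unsafeAtomicAdd(&ind_arg0[map0[tid]], _l); // fast but relaxed FP atomic
      }
    }
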
"../user_types.h"') code('#include "op_lib_cpp.h"') - code('#include "op_cuda_rt_support.h"') - code('#include "op_cuda_reduction.h"') + code('#include "op_'+cuda+'_rt_support.h"') + code('#include "op_'+cuda+'_reduction.h"') for nc in range(0,len(consts)): code('') @@ -1321,7 +1328,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): IF('dim*sizeof('+consts[nc]['type'][1:-1]+')>MAX_CONST_SIZE') code('printf("error: MAX_CONST_SIZE not big enough\\n"); exit(1);') ENDIF() - code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));') + code('cutilSafeCall('+cuda+'MemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));') depth = depth - 2 code('}') @@ -1330,10 +1337,10 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): for nk in range(0,len(kernels)): file_text = file_text +\ - '#include "'+kernels[nk]['name']+'_kernel.cu"\n' + '#include "'+kernels[nk]['name']+'_kernel.'+('cpp' if hip==1 else 'cu')+'"\n' master = master.split('.')[0] - fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w') + fid = open(cuda+'/'+master.split('.')[0]+'_kernels.'+('cpp' if hip==1 else 'cu'),'w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() From d680600376074c1f83c6f9ebc5d5615e9de59b98 Mon Sep 17 00:00:00 2001 From: Istvan Reguly Date: Tue, 3 Jan 2023 21:23:50 +0100 Subject: [PATCH 2/5] Fixing bugs for CUDA with SoA and one of volna's kernels don't translate to vec --- translator/c/op2_gen_cuda_simple.py | 29 +++++++++++++++++++++-------- translator/c/op2_gen_mpi_vec.py | 5 ++++- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py index d3551aaaf..09c646025 100644 --- a/translator/c/op2_gen_cuda_simple.py +++ b/translator/c/op2_gen_cuda_simple.py @@ -200,6 +200,16 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0) for i in range(0,nargs): inds_staged[i] = inds_staged[i] + 1 + print(name) + no_restrict = [0]*nargs + for i in range(0,ninds): + varname = var[invinds[i]] + print(varname) + for j in range(0,nargs): + if varname == var[j] and maps[j] == OP_ID and soaflags[j]: + print('no restrict ' +varname+str(inds)) + no_restrict[invinds[i]] = 1 + break ########################################################################## # start with CUDA kernel function ########################################################################## @@ -274,12 +284,12 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0) return for i in range(0,nargs_novec): - var = signature_text.split(',')[i].strip() + var1 = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i] and (op_color2 or not (kernels[nk]['maps'][i] == OP_MAP and kernels[nk]['accs'][i] == OP_INC)): - var = var.replace('*','') - #locate var in body and replace by adding [idx] - length = len(re.compile('\\s+\\b').split(var)) - var2 = re.compile('\\s+\\b').split(var)[length-1].strip() + var1 = var1.replace('*','') + #locate var1 in body and replace by adding [idx] + length = len(re.compile('\\s+\\b').split(var1)) + var2 = re.compile('\\s+\\b').split(var1)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9_]*)\]'+'', var2+r'\1[(\2)*'+op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) @@ -309,10 
@@ -309,10 +319,13 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0)
     code('int optflags,')
 
   for g_m in range(0,ninds):
-    if (indaccs[g_m]==OP_READ):
-      code('const <TYP> *__restrict <ARG>,')
+    restrict = ''
+    if no_restrict[invinds[g_m]]==0:
+      restrict = '__restrict'
+    if indaccs[g_m]==OP_READ:
+      code('const <TYP> *'+restrict+' <ARG>,')
     else:
-      code('<TYP> *__restrict <ARG>,')
+      code('<TYP> *'+restrict+' <ARG>,')
 
   if nmaps > 0:
     k = []
diff --git a/translator/c/op2_gen_mpi_vec.py b/translator/c/op2_gen_mpi_vec.py
index 8dd34d810..9f2ca2e5c 100644
--- a/translator/c/op2_gen_mpi_vec.py
+++ b/translator/c/op2_gen_mpi_vec.py
@@ -803,7 +803,10 @@ def op2_gen_mpi_vec(master, date, consts, kernels):
   comm(' user kernel files')
 
   for nk in range(0,len(kernels)):
-    code('#include "'+kernels[nk]['name']+'_veckernel.cpp"')
+    if 'initBathymetry_large' in kernels[nk]['name']:
+      code('#include "../seq/'+kernels[nk]['name']+'_seqkernel.cpp"')
+    else:
+      code('#include "'+kernels[nk]['name']+'_veckernel.cpp"')
 
   master = master.split('.')[0]
   fid = open('vec/'+master.split('.')[0]+'_veckernels.cpp','w')
   fid.write('//\n// auto-generated by op2.py\n//\n\n')

From eb7cbe61b4a3b0ee6c69cffb7b4ca8690c74a55b Mon Sep 17 00:00:00 2001
From: Reguly Reguly
Date: Fri, 28 Apr 2023 11:06:09 +0300
Subject: [PATCH 3/5] OpenMP 4 patches, atomic version

---
 makefiles/compilers/c/clang.mk            |   4 +
 makefiles/compilers/c/cray.mk             |   4 +-
 op2/include/op_cuda_rt_support.h          |   4 +-
 op2/include/op_hip_rt_support.h           |   4 +-
 op2/src/hip/op_hip_decl.cpp               |  30 +++---
 op2/src/hip/op_hip_rt_support.cpp         |   4 +-
 op2/src/mpi/op_mpi_hip_decl.cpp           |   4 +-
 op2/src/mpi/op_mpi_part_core.cpp          |   6 +-
 op2/src/openmp4/op_openmp4_rt_support.cpp |  11 ++
 translator/c/op2_gen_openmp4.py           | 122 +++++++++++++++-------
 10 files changed, 131 insertions(+), 62 deletions(-)

diff --git a/makefiles/compilers/c/clang.mk b/makefiles/compilers/c/clang.mk
index 345f99000..756c3b2c1 100644
--- a/makefiles/compilers/c/clang.mk
+++ b/makefiles/compilers/c/clang.mk
@@ -4,3 +4,7 @@ CONFIG_CC := clang
 CONFIG_CXX := clang++
 
 CONFIG_CXXLINK ?= -lc++
+
+CONFIG_CPP_HAS_OMP_OFFLOAD ?= true
+OMP_OFFLOAD_CXXFLAGS = -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a
+CONFIG_OMP_OFFLOAD_CXXFLAGS = -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a
diff --git a/makefiles/compilers/c/cray.mk b/makefiles/compilers/c/cray.mk
index 02671eeea..2e41f37b0 100644
--- a/makefiles/compilers/c/cray.mk
+++ b/makefiles/compilers/c/cray.mk
@@ -23,4 +23,6 @@ CONFIG_OMP_CPPFLAGS ?= -fopenmp
 CONFIG_CPP_HAS_OMP ?= true
 
 # CONFIG_OMP_OFFLOAD_CPPFLAGS ?=
-CONFIG_CPP_HAS_OMP_OFFLOAD ?= false
\ No newline at end of file
+CONFIG_CPP_HAS_OMP_OFFLOAD ?= true
+OMP_OFFLOAD_CXXFLAGS = -fopenmp
+CONFIG_OMP_OFFLOAD_CXXFLAGS = -fopenmp
diff --git a/op2/include/op_cuda_rt_support.h b/op2/include/op_cuda_rt_support.h
index f9a761f79..4a7b21b3f 100644
--- a/op2/include/op_cuda_rt_support.h
+++ b/op2/include/op_cuda_rt_support.h
@@ -84,9 +84,9 @@ void cutilDeviceInit_mpi(int argc, char **argv, int mpi_rank);
  * routines to move arrays to/from GPU device
  */
 
-void op_mvHostToDevice(void **map, int size);
+void op_mvHostToDevice(void **map, size_t size);
 
-void op_cpHostToDevice(void **data_d, void **data_h, int size);
+void op_cpHostToDevice(void **data_d, void **data_h, size_t size);
 
 void op_cuda_get_data(op_dat dat);
 
diff --git a/op2/include/op_hip_rt_support.h b/op2/include/op_hip_rt_support.h
index 78eb42908..676b5a82d 
100644 --- a/op2/include/op_hip_rt_support.h +++ b/op2/include/op_hip_rt_support.h @@ -82,9 +82,9 @@ void cutilDeviceInit_mpi(int argc, char **argv, int mpi_rank); * routines to move arrays to/from GPU device */ -void op_mvHostToDevice(void **map, int size); +void op_mvHostToDevice(void **map, size_t size); -void op_cpHostToDevice(void **data_d, void **data_h, int size); +void op_cpHostToDevice(void **data_d, void **data_h, size_t size); void op_cuda_get_data(op_dat dat); diff --git a/op2/src/hip/op_hip_decl.cpp b/op2/src/hip/op_hip_decl.cpp index e3b3dc586..dc3203dbf 100644 --- a/op2/src/hip/op_hip_decl.cpp +++ b/op2/src/hip/op_hip_decl.cpp @@ -113,20 +113,20 @@ op_dat op_decl_dat_char(op_set set, int dim, char const *type, int size, if (strstr(type, ":soa") != NULL || (OP_auto_soa && dim > 1)) { char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); int element_size = dat->size / dat->dim; - for (int i = 0; i < dat->dim; i++) { - for (int j = 0; j < set_size; j++) { - for (int c = 0; c < element_size; c++) { + for (size_t i = 0; i < dat->dim; i++) { + for (size_t j = 0; j < set_size; j++) { + for (size_t c = 0; c < element_size; c++) { temp_data[element_size * i * set_size + element_size * j + c] = data[dat->size * j + element_size * i + c]; } } } op_cpHostToDevice((void **)&(dat->data_d), (void **)&(temp_data), - dat->size * set_size); + (size_t)dat->size * set_size); free(temp_data); } else { op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), - dat->size * set_size); + (size_t)dat->size * set_size); } return dat; @@ -142,7 +142,7 @@ op_dat op_decl_dat_temp_char(op_set set, int dim, char const *type, int size, dat->user_managed = 0; op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), - dat->size * set->size); + (size_t)dat->size * set->size); return dat; } @@ -162,14 +162,14 @@ op_map op_decl_map(op_set from, op_set to, int dim, int *imap, char const *name) { op_map map = op_decl_map_core(from, to, dim, imap, name); int set_size = map->from->size + map->from->exec_size; - int *temp_map = (int *)malloc(map->dim * set_size * sizeof(int)); - for (int i = 0; i < map->dim; i++) { - for (int j = 0; j < set_size; j++) { + int *temp_map = (int *)malloc((size_t)map->dim * set_size * sizeof(int)); + for (size_t i = 0; i < map->dim; i++) { + for (size_t j = 0; j < set_size; j++) { temp_map[i * set_size + j] = map->map[map->dim * j + i]; } } op_cpHostToDevice((void **)&(map->map_d), (void **)&(temp_map), - map->dim * set_size * sizeof(int)); + (size_t)map->dim * set_size * sizeof(int)); free(temp_map); return map; } @@ -304,20 +304,20 @@ void op_upload_all() { if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); int element_size = dat->size / dat->dim; - for (int i = 0; i < dat->dim; i++) { - for (int j = 0; j < set_size; j++) { - for (int c = 0; c < element_size; c++) { + for (size_t i = 0; i < dat->dim; i++) { + for (size_t j = 0; j < set_size; j++) { + for (size_t c = 0; c < element_size; c++) { temp_data[element_size * i * set_size + element_size * j + c] = dat->data[dat->size * j + element_size * i + c]; } } } - cutilSafeCall(hipMemcpy(dat->data_d, temp_data, dat->size * set_size, + cutilSafeCall(hipMemcpy(dat->data_d, temp_data, (size_t)dat->size * set_size, hipMemcpyHostToDevice)); dat->dirty_hd = 0; free(temp_data); } else { - cutilSafeCall(hipMemcpy(dat->data_d, dat->data, dat->size * set_size, + cutilSafeCall(hipMemcpy(dat->data_d, dat->data, 
(size_t)dat->size * set_size, hipMemcpyHostToDevice)); dat->dirty_hd = 0; } diff --git a/op2/src/hip/op_hip_rt_support.cpp b/op2/src/hip/op_hip_rt_support.cpp index c0ae1e637..3704124a7 100644 --- a/op2/src/hip/op_hip_rt_support.cpp +++ b/op2/src/hip/op_hip_rt_support.cpp @@ -93,7 +93,7 @@ void __cutilCheckMsg(const char *errorMessage, const char *file, // routines to move arrays to/from GPU device // -void op_mvHostToDevice(void **map, int size) { +void op_mvHostToDevice(void **map, size_t size) { if (!OP_hybrid_gpu || size == 0) return; void *tmp; @@ -104,7 +104,7 @@ void op_mvHostToDevice(void **map, int size) { *map = tmp; } -void op_cpHostToDevice(void **data_d, void **data_h, int size) { +void op_cpHostToDevice(void **data_d, void **data_h, size_t size) { if (!OP_hybrid_gpu) return; if (*data_d != NULL) cutilSafeCall(hipFree(*data_d)); diff --git a/op2/src/mpi/op_mpi_hip_decl.cpp b/op2/src/mpi/op_mpi_hip_decl.cpp index 5841e370a..ea6e04ebb 100644 --- a/op2/src/mpi/op_mpi_hip_decl.cpp +++ b/op2/src/mpi/op_mpi_hip_decl.cpp @@ -285,7 +285,7 @@ void op_mv_halo_list_device() { op_cpHostToDevice((void **)&(export_exec_list_d[set->index]), (void **)&(OP_export_exec_list[set->index]->list), - OP_export_exec_list[set->index]->size * sizeof(int)); + (size_t)OP_export_exec_list[set->index]->size * sizeof(int)); } if (export_nonexec_list_d != NULL) { @@ -302,7 +302,7 @@ void op_mv_halo_list_device() { op_cpHostToDevice((void **)&(export_nonexec_list_d[set->index]), (void **)&(OP_export_nonexec_list[set->index]->list), - OP_export_nonexec_list[set->index]->size * sizeof(int)); + (size_t)OP_export_nonexec_list[set->index]->size * sizeof(int)); } //for grouped, we need the disps array on device too diff --git a/op2/src/mpi/op_mpi_part_core.cpp b/op2/src/mpi/op_mpi_part_core.cpp index b68374169..f1efc84b3 100644 --- a/op2/src/mpi/op_mpi_part_core.cpp +++ b/op2/src/mpi/op_mpi_part_core.cpp @@ -1083,7 +1083,7 @@ static void migrate_all(int my_rank, int comm_size) { //MPI_Isend(sbuf[i], (size_t)dat->size/sizeof(double) * exp->sizes[i], MPI_DOUBLE, exp->ranks[i], // d, OP_PART_WORLD, &request_send[i]); if ((size_t)dat->size * exp->sizes[i] > (size_t)INT_MAX) printf("Integer overflow at %s: %d\n",__FILE__,__LINE__); - MPI_Isend(sbuf[i], (size_t)dat->size * exp->sizes[i], MPI_CHAR, exp->ranks[i], + MPI_Isend(sbuf[i], (size_t)dat->size/8 * exp->sizes[i], MPI_DOUBLE, exp->ranks[i], d, OP_PART_WORLD, &request_send[i]); } @@ -1096,8 +1096,8 @@ static void migrate_all(int my_rank, int comm_size) { // MPI_DOUBLE, imp->ranks[i], d, OP_PART_WORLD, // MPI_STATUS_IGNORE); if ((size_t)dat->size * imp->sizes[i] > (size_t)INT_MAX) printf("Integer overflow at %s: %d\n",__FILE__,__LINE__); - MPI_Recv(&rbuf[(size_t)imp->disps[i] * (size_t)dat->size], (size_t)dat->size * imp->sizes[i], - MPI_CHAR, imp->ranks[i], d, OP_PART_WORLD, + MPI_Recv(&rbuf[(size_t)imp->disps[i] * (size_t)dat->size], (size_t)dat->size/8 * imp->sizes[i], + MPI_DOUBLE, imp->ranks[i], d, OP_PART_WORLD, MPI_STATUS_IGNORE); } diff --git a/op2/src/openmp4/op_openmp4_rt_support.cpp b/op2/src/openmp4/op_openmp4_rt_support.cpp index afca7760a..078484915 100644 --- a/op2/src/openmp4/op_openmp4_rt_support.cpp +++ b/op2/src/openmp4/op_openmp4_rt_support.cpp @@ -310,6 +310,17 @@ void op_mpi_wait_all_cuda(int nargs, op_arg *args) { (void)args; } +int op_mpi_halo_exchanges_grouped(op_set set, int nargs, op_arg *args, int device){ + (void)device; + return device == 1 ? 
op_mpi_halo_exchanges(set, nargs, args) : op_mpi_halo_exchanges_cuda(set, nargs, args); +} + +void op_mpi_wait_all_grouped(int nargs, op_arg *args, int device) { + (void)device; + (void)nargs; + (void)args; +} + void op_mpi_reset_halos(int nargs, op_arg *args) { (void)nargs; (void)args; diff --git a/translator/c/op2_gen_openmp4.py b/translator/c/op2_gen_openmp4.py index ce9e4f17a..c119975c8 100644 --- a/translator/c/op2_gen_openmp4.py +++ b/translator/c/op2_gen_openmp4.py @@ -96,9 +96,10 @@ def op2_gen_openmp4(master, date, consts, kernels): OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] - op2_compiler = os.getenv('OP2_COMPILER','0'); - any_soa = 0 + op2_compiler = 'clang' #os.getenv('OP2_COMPILER','0'); maptype = 'map' + any_soa=0 + atomics = True for nk in range (0,len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) @@ -210,18 +211,18 @@ def op2_gen_openmp4(master, date, consts, kernels): for i in range(0,nargs_novec): var = signature_text.split(',')[i].strip() - if kernels[nk]['soaflags'][i]: + if kernels[nk]['soaflags'][i] and not (atomics and maps[i] == OP_MAP and accs[i] == OP_INC): var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: - body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ + body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9_]*)\]'+'', var2+r'\1[(\2)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) - body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ + body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9_]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) for nc in range(0,len(consts)): @@ -280,7 +281,10 @@ def op2_gen_openmp4(master, date, consts, kernels): params += indent + 'int dat'+str(g_m)+'size,' if ninds>0: # add indirect kernel specific params to kernel func call - params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' + if atomics: + params += indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' + else: + params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' else: # add direct kernel specific params to kernel func call params += indent + 'int count,' @@ -387,7 +391,6 @@ def op2_gen_openmp4(master, date, consts, kernels): ENDIF() code('') comm(' get plan') - code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit @@ -398,7 +401,7 @@ def op2_gen_openmp4(master, date, consts, kernels): code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') - code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') + code('int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2);') # # get part and block size @@ -425,7 +428,8 @@ def op2_gen_openmp4(master, date, consts, kernels): if ninds > 0: code('') - code('int ncolors = 0;') + if not atomics: + code('int ncolors = 0;') code('int set_size1 = set->size + set->exec_size;') code('') IF('set_size >0') @@ -480,19 +484,36 @@ def op2_gen_openmp4(master, date, consts, kernels): # prepare kernel params for 
indirect version # if ninds>0: - code('') - code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') - code('ncolors = Plan->ncolors;') - code('int *col_reord = Plan->col_reord;') - code('') - comm(' execute plan') - FOR('col','0','Plan->ncolors') - IF('col==1') - code('op_mpi_wait_all_cuda(nargs, args);') - ENDIF() - code('int start = Plan->col_offsets[0][col];') - code('int end = Plan->col_offsets[0][col+1];') - code('') + if atomics: + if reduct: + FOR('round','0','3') + else: + FOR('round','0','2') + IF('round==1') + code('op_mpi_wait_all_grouped(nargs, args, 2);') + #code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + if reduct: + code('int start = round==0 ? 0 : (round==1 ? set->core_size : set->size);') + code('int end = round==0 ? set->core_size : (round==1? set->size : set->size + set->exec_size);') + else: + code('int start = round==0 ? 0 : set->core_size;') + code('int end = round==0 ? set->core_size : set->size + set->exec_size;') + IF('end-start>0') + else: + code('') + code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') + code('ncolors = Plan->ncolors;') + code('int *col_reord = Plan->col_reord;') + code('') + comm(' execute plan') + FOR('col','0','Plan->ncolors') + IF('col==1') + code('op_mpi_wait_all_grouped(nargs, args, 2);') + ENDIF() + code('int start = Plan->col_offsets[0][col];') + code('int end = Plan->col_offsets[0][col+1];') + code('') # # kernel function call # @@ -501,10 +522,11 @@ def op2_gen_openmp4(master, date, consts, kernels): call_params = call_params.replace('*','') # set params for indirect version if ninds>0: + call_params = re.sub(r'\bcount\b','set_size1',call_params); call_params = call_params.replace('num_teams','part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread') # set params for direct version else: - call_params = re.sub('count','set->size',call_params); + call_params = re.sub(r'\bcount\b','set->size',call_params); call_params = call_params.replace('num_teams','part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') code(func_call_signaure_text.split(' ')[-1]+call_params+');') code('') @@ -512,7 +534,10 @@ def op2_gen_openmp4(master, date, consts, kernels): if ninds>0: if reduct: comm(' combine reduction data') - IF('col == Plan->ncolors_owned-1') + if atomics: + IF('round == 1') + else: + IF('col == Plan->ncolors_owned-1') for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: @@ -525,16 +550,23 @@ def op2_gen_openmp4(master, date, consts, kernels): error('internal error: invalid reduction option') ENDIF() ENDFOR() - code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') - code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + if not atomics: + code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') + code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + if ninds>0: + if atomics: + ENDIF() ENDIF() code('') #zero set size issues if ninds>0: - IF('set_size == 0 || set_size == set->core_size || ncolors == 1') - code('op_mpi_wait_all_cuda(nargs, args);') + if atomics: + IF('set_size == 0 || set_size == set->core_size') + else: + IF('set_size == 0 || set_size == set->core_size || ncolors == 1') + code('op_mpi_wait_all_grouped(nargs, args, 2);') ENDIF() # @@ -624,7 +656,7 @@ def op2_gen_openmp4(master, date, consts, kernels): line = '#pragma omp target teams' if op2_compiler == 'clang': line +=' distribute parallel for schedule(static,1)\\\n' + (depth+2)*' ' - 
line +=' num_teams(num_teams) thread_limit(nthread) '
+      line +=' thread_limit(nthread) ' #num_teams(num_teams)
     map_clause = ''
     if maptype == 'map':
       map_clause = 'map(to:'
@@ -675,10 +707,13 @@ def op2_gen_openmp4(master, date, consts, kernels):
 #  map extra pointers for indirect version
 #
   if ninds>0:
-    if maptype == 'map':
-      line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],'
+    if atomics:
+      line += '\\\n'+(depth+2)*' '+'map(to:'
     else:
-      line += '\\\n'+(depth+2)*' '+'map(to:col_reord,'
+      if maptype == 'map':
+        line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],'
+      else:
+        line += '\\\n'+(depth+2)*' '+'map(to:col_reord,'
     if nmaps > 0:
       k = []
       for g_m in range(0,nargs):
@@ -706,8 +741,11 @@ def op2_gen_openmp4(master, date, consts, kernels):
 #  start for loop indirect version
 #
   if ninds>0:
-    FOR('e','start','end')
-    code('int n_op = col_reord[e];')
+    if atomics:
+      FOR('n_op', 'start', 'end')
+    else:
+      FOR('e','start','end')
+      code('int n_op = col_reord[e];')
     if nmaps > 0:
       k = []
       for g_m in range(0,nargs):
@@ -779,7 +817,11 @@ def op2_gen_openmp4(master, date, consts, kernels):
       else:
         line = ''
     else:
-      if soaflags[g_m]:
+      if atomics and accs[g_m] == OP_INC:
+        code(typs[g_m] + ' arg'+str(g_m)+'_l['+dims[g_m]+'];')
+        code('for (int d = 0; d < '+dims[g_m]+';d++) arg'+str(g_m)+'_l[d] = 0;')
+        line += '&arg'+str(g_m)+'_l[0]'
+      elif soaflags[g_m]:
         line += '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]'
       else:
         line += '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
@@ -801,6 +843,16 @@ def op2_gen_openmp4(master, date, consts, kernels):
     code(inline_body_text)
     comm('end inline func')
 
+  for g_m in range(0,nargs):
+    if maps[g_m] == OP_MAP and atomics and accs[g_m] == OP_INC:
+      # FOR('d', '0', dims[g_m])
+      for d in range(0,int(dims[g_m])):
+        code('#pragma omp atomic')
+        if soaflags[g_m]:
+          code('data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx + '+str(d)+' * '+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'] += arg%d_l[%d];' % (g_m,d))
+        else:
+          code('data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx + '+str(d)+'] += arg%d_l[%d];' % (g_m,d))
+  # ENDFOR()
   ENDFOR()
   code('')
 # end kernel function

From 45d6097c4f75ec9fa839df632b5e99f36354d9d9 Mon Sep 17 00:00:00 2001
From: Reguly Reguly
Date: Tue, 10 Oct 2023 15:49:07 +0300
Subject: [PATCH 4/5] unsafe atomic adds for float/double only

---
 translator/c/op2_gen_cuda_simple.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py
index 09c646025..312857a22 100644
--- a/translator/c/op2_gen_cuda_simple.py
+++ b/translator/c/op2_gen_cuda_simple.py
@@ -118,8 +118,8 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0)
 
 #Optimization settings
   inc_stage=0
-  op_color2_force=1
-  atomics=0
+  op_color2_force=0
+  atomics=1
 
   name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
@@ -766,7 +766,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0)
         IF('optflags & 1<<'+str(optidxs[g_m]))
       for d in range(0,int(dims[g_m])):
         atomicAdd = 'atomicAdd'
-        if hip == 1:
+        if hip == 1 and (typs[g_m]=='float' or typs[g_m]=='double'):
           atomicAdd = 'unsafeAtomicAdd'
         if soaflags[g_m]:
           code(atomicAdd+'(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx],_l['+str(d)+']);')
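Note: patches 3 and 4 together move both the OpenMP 4 and the CUDA/HIP generators from plan-based colored execution to atomics. The emitted pattern is: accumulate each indirect OP_INC argument into a thread-local array inside the user kernel, then commit it with #pragma omp atomic (or atomicAdd/unsafeAtomicAdd on the GPU paths). A hand-written sketch of the OpenMP shape, with illustrative names (data0, map0, dimension fixed at 4) rather than real generator output:

    // Assumes data0 and map0 are already mapped to the device.
    void inc_loop(double *data0, const int *map0, int start, int end) {
      #pragma omp target teams distribute parallel for
      for (int n_op = start; n_op < end; n_op++) {
        double arg0_l[4];                           // thread-local accumulator
        for (int d = 0; d < 4; d++) arg0_l[d] = 0.0;
        /* ... user kernel increments arg0_l here ... */
        for (int d = 0; d < 4; d++) {
          #pragma omp atomic
          data0[4 * map0[n_op] + d] += arg0_l[d];   // atomic commit
        }
      }
    }

This removes the op_plan_get_stage coloring and the col_reord indirection at the cost of atomic updates, which the float/double-only unsafeAtomicAdd change above is evidently meant to keep cheap on the targeted AMD hardware.
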
From 28aa8dfe00a2e0daa69ab43c8bd97ed37e5aa0b9 Mon Sep 17 00:00:00 2001
From: ristvan
Date: Wed, 20 Dec 2023 14:09:11 -0800
Subject: [PATCH 5/5] int/size_t mismatches, nvhpc with omp offload

---
 makefiles/compilers/c/nvhpc.mk            | 2 +-
 op2/src/cuda/op_cuda_rt_support.cpp       | 4 ++--
 op2/src/openmp4/op_openmp4_decl.cpp       | 4 ++--
 op2/src/openmp4/op_openmp4_rt_support.cpp | 9 +++++----
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/makefiles/compilers/c/nvhpc.mk b/makefiles/compilers/c/nvhpc.mk
index 0245eda5c..68a7af3fc 100644
--- a/makefiles/compilers/c/nvhpc.mk
+++ b/makefiles/compilers/c/nvhpc.mk
@@ -20,5 +20,5 @@ CONFIG_CPP_HAS_OMP ?= true
 GPU_FFLAG := -gpu=fastmath,ptxinfo,lineinfo
 $(foreach arch,$(CUDA_GEN),$(eval GPU_FFLAG := $(GPU_FFLAG),cc$(arch)))
 
-CONFIG_OMP_OFFLOAD_CPPFLAGS ?= -mp=gpu $(GPU_FFLAG)
+CONFIG_OMP_OFFLOAD_CXXFLAGS ?= -mp=gpu $(GPU_FFLAG)
 CONFIG_CPP_HAS_OMP_OFFLOAD ?= true
diff --git a/op2/src/cuda/op_cuda_rt_support.cpp b/op2/src/cuda/op_cuda_rt_support.cpp
index 52414e54d..673d353c1 100644
--- a/op2/src/cuda/op_cuda_rt_support.cpp
+++ b/op2/src/cuda/op_cuda_rt_support.cpp
@@ -94,7 +94,7 @@ void __cutilCheckMsg(const char *errorMessage, const char *file,
 // routines to move arrays to/from GPU device
 //
 
-void op_mvHostToDevice(void **map, int size) {
+void op_mvHostToDevice(void **map, size_t size) {
   if (!OP_hybrid_gpu || size == 0)
     return;
   void *tmp;
@@ -105,7 +105,7 @@ void op_mvHostToDevice(void **map, int size) {
   *map = tmp;
 }
 
-void op_cpHostToDevice(void **data_d, void **data_h, int size) {
+void op_cpHostToDevice(void **data_d, void **data_h, size_t size) {
   if (!OP_hybrid_gpu)
     return;
   if (*data_d != NULL) cutilSafeCall(cudaFree(*data_d));
diff --git a/op2/src/openmp4/op_openmp4_decl.cpp b/op2/src/openmp4/op_openmp4_decl.cpp
index da8048c12..d8ef6980f 100644
--- a/op2/src/openmp4/op_openmp4_decl.cpp
+++ b/op2/src/openmp4/op_openmp4_decl.cpp
@@ -9,9 +9,9 @@
 
 void cutilDeviceInit(int argc, char **argv);
 
-void op_mvHostToDevice(void **map, int size);
+void op_mvHostToDevice(void **map, size_t size);
 
-void op_cpHostToDevice(void **data_d, void **data_h, int size);
+void op_cpHostToDevice(void **data_d, void **data_h, size_t size);
 
 void op_cuda_exit();
 
diff --git a/op2/src/openmp4/op_openmp4_rt_support.cpp b/op2/src/openmp4/op_openmp4_rt_support.cpp
index 078484915..32a59f7ad 100644
--- a/op2/src/openmp4/op_openmp4_rt_support.cpp
+++ b/op2/src/openmp4/op_openmp4_rt_support.cpp
@@ -19,7 +19,7 @@
 // routines to move arrays to/from GPU device
 //
 
-void op_mvHostToDevice(void **map, int size) {
+void op_mvHostToDevice(void **map, size_t size) {
   if (!OP_hybrid_gpu)
     return;
   char *temp = (char*)*map;
@@ -28,7 +28,7 @@ void op_mvHostToDevice(void **map, int size) {
   //TODO test
 }
 
-void op_cpHostToDevice(void **data_d, void **data_h, int size) {
+void op_cpHostToDevice(void **data_d, void **data_h, size_t size) {
   if (!OP_hybrid_gpu)
     return;
   *data_d = (char*)op_malloc(size);
@@ -121,8 +121,9 @@ void op_cuda_exit() {
     return;
   op_dat_entry *item;
   TAILQ_FOREACH(item, &OP_dat_list, entries) {
-    #pragma omp target exit data map(from: (item->dat)->data_d)
-    free((item->dat)->data_d);
+    char *data_d = (item->dat)->data_d;
+    #pragma omp target exit data map(from: data_d)
+    free(data_d);
  }
 /*
   for (int ip = 0; ip < OP_plan_index; ip++) {