From 3e0790c08e8de660f8d54819af0a488c487f248a Mon Sep 17 00:00:00 2001 From: Reguly Reguly Date: Sun, 4 Dec 2022 14:57:40 +0200 Subject: [PATCH 1/5] HIP on latest develop, working with Cray --- makefiles/c_app.mk | 20 +- makefiles/common.mk | 16 +- makefiles/compilers.mk | 12 + makefiles/compilers/c/cray.mk | 2 +- makefiles/compilers/c_hip/hip.mk | 9 + makefiles/dependencies/hip.mk | 33 ++ makefiles/dependencies/tests/hip.cpp | 7 + op2/Makefile | 39 ++ op2/include/op_hip_reduction.h | 259 +++++++++ op2/include/op_hip_rt_support.h | 137 +++++ op2/src/hip/op_hip_decl.cpp | 393 +++++++++++++ op2/src/hip/op_hip_rt_support.cpp | 555 +++++++++++++++++++ op2/src/mpi/op_mpi_hip_decl.cpp | 749 +++++++++++++++++++++++++ op2/src/mpi/op_mpi_hip_kernels.cpp | 453 +++++++++++++++ op2/src/mpi/op_mpi_hip_rt_support.cpp | 763 ++++++++++++++++++++++++++ translator/c/op2.py | 1 + translator/c/op2_gen_cuda_simple.py | 35 +- 17 files changed, 3464 insertions(+), 19 deletions(-) create mode 100644 makefiles/compilers/c_hip/hip.mk create mode 100644 makefiles/dependencies/hip.mk create mode 100644 makefiles/dependencies/tests/hip.cpp create mode 100644 op2/include/op_hip_reduction.h create mode 100644 op2/include/op_hip_rt_support.h create mode 100644 op2/src/hip/op_hip_decl.cpp create mode 100644 op2/src/hip/op_hip_rt_support.cpp create mode 100644 op2/src/mpi/op_mpi_hip_decl.cpp create mode 100644 op2/src/mpi/op_mpi_hip_kernels.cpp create mode 100644 op2/src/mpi/op_mpi_hip_rt_support.cpp diff --git a/makefiles/c_app.mk b/makefiles/c_app.mk index 8de173d26..e392cc5b8 100644 --- a/makefiles/c_app.mk +++ b/makefiles/c_app.mk @@ -9,7 +9,7 @@ APP_ENTRY_MPI_BASENAME := $(basename $(APP_ENTRY_MPI)) APP_ENTRY_OP := $(APP_ENTRY_BASENAME)_op.cpp APP_ENTRY_MPI_OP := $(APP_ENTRY_MPI_BASENAME)_op.cpp -ALL_VARIANTS := seq genseq vec openmp openmp4 cuda cuda_hyb +ALL_VARIANTS := seq genseq vec openmp openmp4 cuda cuda_hyb hip ALL_VARIANTS += $(foreach variant,$(ALL_VARIANTS),mpi_$(variant)) ALL_VARIANTS := $(foreach variant,$(ALL_VARIANTS),$(APP_NAME)_$(variant)) @@ -28,6 +28,10 @@ ifeq ($(HAVE_C),true) ifeq ($(HAVE_CUDA),true) BASE_BUILDABLE_VARIANTS += cuda cuda_hyb endif + + ifeq ($(HAVE_HIP),true) + BASE_BUILDABLE_VARIANTS += hip + endif endif BUILDABLE_VARIANTS := @@ -69,7 +73,7 @@ all: $(BUILDABLE_VARIANTS) clean: -$(RM) $(ALL_VARIANTS) - -$(RM) -r seq vec openmp openmp4 cuda openacc + -$(RM) -r seq vec openmp openmp4 cuda hip openacc -$(RM) *_op.cpp -$(RM) .generated .generated -$(RM) *.d @@ -109,6 +113,8 @@ CUDA_HYB_SRC := $(APP_ENTRY_OP) \ MPI_CUDA_HYB_SRC := $(APP_ENTRY_MPI_OP) \ cuda/$(APP_NAME)_mpi_hybkernels_cpu.o cuda/$(APP_NAME)_mpi_hybkernels_gpu.o +HIP_SRC := $(APP_ENTRY_OP) hip/$(APP_NAME)_kernels.o +MPI_HIP_SRC := $(APP_ENTRY_MPI_OP) hip/$(APP_NAME)_mpi_kernels.o # $(1) = variant name # $(2) = additional flags @@ -134,10 +140,14 @@ $(eval $(call RULE_template, openmp, $(OMP_CPPFLAGS), $(eval $(call RULE_template, openmp4, $(OMP_OFFLOAD_CPPFLAGS) -DOP2_WITH_OMP4, OPENMP4, )) $(eval $(call RULE_template, cuda,, CUDA, MPI_CUDA)) $(eval $(call RULE_template, cuda_hyb, $(OMP_CPPFLAGS), CUDA, MPI_CUDA)) +$(eval $(call RULE_template, hip,, HIP, MPI_HIP)) $(APP_NAME)_cuda: cuda/$(APP_NAME)_kernels.o $(APP_NAME)_mpi_cuda: cuda/$(APP_NAME)_mpi_kernels.o +$(APP_NAME)_hip: hip/$(APP_NAME)_kernels.o +$(APP_NAME)_mpi_hip: hip/$(APP_NAME)_mpi_kernels.o + $(APP_NAME)_cuda_hyb: cuda/$(APP_NAME)_hybkernels_gpu.o cuda/$(APP_NAME)_hybkernels_cpu.o $(APP_NAME)_mpi_cuda_hyb: cuda/$(APP_NAME)_mpi_hybkernels_gpu.o 
cuda/$(APP_NAME)_mpi_hybkernels_cpu.o

@@ -147,6 +157,12 @@ cuda/$(APP_NAME)_kernels.o: .generated
 cuda/$(APP_NAME)_mpi_kernels.o: .generated
 	$(NVCC) $(NVCCFLAGS) $(OP2_INC) -c cuda/$(APP_ENTRY_MPI_BASENAME)_kernels.cu -o $@
 
+hip/$(APP_NAME)_kernels.o: .generated
+	$(HIPCXX) $(HIPFLAGS) $(OP2_INC) -c hip/$(APP_ENTRY_BASENAME)_kernels.cpp -o $@
+
+hip/$(APP_NAME)_mpi_kernels.o: .generated
+	$(HIPCXX) $(HIPFLAGS) $(OP2_INC) -c hip/$(APP_ENTRY_MPI_BASENAME)_kernels.cpp -o $@
+
 cuda/$(APP_NAME)_hybkernels_gpu.o: .generated
 	$(NVCC) $(NVCCFLAGS) -DOP_HYBRID_GPU -DGPUPASS $(OP2_INC) \
 		-c cuda/$(APP_ENTRY_BASENAME)_hybkernels.cu -o $@
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 4de2a2f01..9e9a3d4fb 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -78,6 +78,13 @@ ifeq ($(MAKECMDGOALS),config)
     $(call info_bold,> C/C++ CUDA compiler $(TEXT_NOTFOUND); skipping search for CUDA libraries)
   endif
 
+  ifeq ($(CONFIG_HAVE_C_HIP),true)
+    $(call info_bold,> C/C++ HIP compiler $(TEXT_FOUND) ($(CONFIG_HIP)); looking for the HIP libraries)
+    include $(DEPS_DIR)/hip.mk
+  else
+    $(call info_bold,> C/C++ HIP compiler $(TEXT_NOTFOUND); skipping search for HIP libraries)
+  endif
+
   $(info )
 
   ifeq ($(CONFIG_HAVE_MPI_C),true)
@@ -139,6 +146,7 @@ ifneq ($(MAKECMDGOALS),clean)
   $(info .   C:       $(if $(HAVE_C),$(CC),not found))
   $(info .   C++:     $(if $(HAVE_C),$(CXX),not found))
   $(info .   CUDA:    $(if $(HAVE_C_CUDA),$(NVCC),not found))
+  $(info .   HIP:     $(if $(HAVE_C_HIP),$(HIPCXX),not found))
   $(info .   Fortran: $(if $(HAVE_F),$(FC),not found))
   $(info )
   $(info MPI compilers:)
@@ -161,14 +169,15 @@ ifneq ($(MAKECMDGOALS),clean)
   $(info .   C:       $(CFLAGS))
   $(info .   C++:     $(CXXFLAGS))
   $(info .   CUDA:    $(NVCCFLAGS))
+  $(info .   HIP:     $(HIPFLAGS))
   $(info .   Fortran: $(FFLAGS))
   $(info )
 endif
 
-OP2_LIBS_SINGLE_NODE := seq cuda openmp openmp4
+OP2_LIBS_SINGLE_NODE := seq cuda hip openmp openmp4
 OP2_FOR_LIBS_SINGLE_NODE := $(foreach lib,$(OP2_LIBS_SINGLE_NODE),f_$(lib))
 
-OP2_LIBS_MPI := mpi mpi_cuda
+OP2_LIBS_MPI := mpi mpi_cuda mpi_hip
 OP2_FOR_LIBS_MPI := $(foreach lib,$(OP2_LIBS_MPI),f_$(lib))
 
 OP2_LIBS := hdf5 $(OP2_LIBS_SINGLE_NODE) $(OP2_LIBS_MPI)
@@ -199,3 +208,6 @@ $(foreach lib,$(OP2_LIBS_MPI),$(eval $(call OP2_LIB_template,$(lib),\
 
 OP2_LIB_CUDA += $(CUDA_LIB)
 OP2_LIB_MPI_CUDA += $(CUDA_LIB)
+
+OP2_LIB_HIP += $(HIP_LIB)
+OP2_LIB_MPI_HIP += $(HIP_LIB)
\ No newline at end of file
diff --git a/makefiles/compilers.mk b/makefiles/compilers.mk
index 74129b4df..8dc4c672d 100644
--- a/makefiles/compilers.mk
+++ b/makefiles/compilers.mk
@@ -2,6 +2,7 @@ ifdef OP2_COMPILER
   OP2_C_COMPILER ?= $(OP2_COMPILER)
   OP2_F_COMPILER ?= $(OP2_COMPILER)
   OP2_C_CUDA_COMPILER ?= nvhpc
+  OP2_C_HIP_COMPILER ?= hip
 endif
 
 # Process CUDA_GEN and NV_ARCH until CUDA_GEN is a whitespace separated list of
@@ -28,6 +29,10 @@ ifdef OP2_C_CUDA_COMPILER
   include $(MAKEFILES_DIR)/compilers/c_cuda/$(OP2_C_CUDA_COMPILER).mk
 endif
 
+ifdef OP2_C_HIP_COMPILER
+  include $(MAKEFILES_DIR)/compilers/c_hip/$(OP2_C_HIP_COMPILER).mk
+endif
+
 ifdef OP2_F_COMPILER
   include $(MAKEFILES_DIR)/compilers/fortran/$(OP2_F_COMPILER).mk
 endif
@@ -48,6 +53,12 @@ ifneq ($(shell which $(CONFIG_NVCC) 2> /dev/null),)
   CONFIG_HAVE_C_CUDA := true
 endif
 
+ifneq ($(shell which $(CONFIG_HIP) 2> /dev/null),)
+  CONFIG_HIP != which $(CONFIG_HIP)
+  CONFIG_HIPCXX = $(CONFIG_HIP)
+  CONFIG_HAVE_C_HIP := true
+endif
+
 ifneq ($(shell which $(CONFIG_FC) 2> /dev/null),)
   CONFIG_FC != which $(CONFIG_FC)
   CONFIG_HAVE_F := true
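With the detection above in place, selecting the HIP toolchain mirrors the existing CUDA flow. A minimal configuration sketch (compiler and architecture values are illustrative; HIP_ARCH must be set, since the hip.mk below passes --offload-arch=$(HIP_ARCH)):

    export OP2_COMPILER=cray        # host C/C++/Fortran compilers
    export OP2_C_HIP_COMPILER=hip   # selects makefiles/compilers/c_hip/hip.mk, i.e. hipcc
    export HIP_ARCH=gfx90a          # GPU target consumed by CONFIG_HIPFLAGS
    make config                     # should report the HIP compiler and libraries as found
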
diff --git a/makefiles/compilers/c/cray.mk b/makefiles/compilers/c/cray.mk
index 8235376c3..02671eeea 100644
--- a/makefiles/compilers/c/cray.mk
+++ b/makefiles/compilers/c/cray.mk
@@ -23,4 +23,4 @@ CONFIG_OMP_CPPFLAGS ?= -fopenmp
 CONFIG_CPP_HAS_OMP ?= true
 
 # CONFIG_OMP_OFFLOAD_CPPFLAGS ?=
-CONFIG_CPP_HAS_OMP_OFFLOAD ?= false
+CONFIG_CPP_HAS_OMP_OFFLOAD ?= false
\ No newline at end of file
diff --git a/makefiles/compilers/c_hip/hip.mk b/makefiles/compilers/c_hip/hip.mk
new file mode 100644
index 000000000..d8bdf3041
--- /dev/null
+++ b/makefiles/compilers/c_hip/hip.mk
@@ -0,0 +1,9 @@
+CONFIG_HIP ?= hipcc
+
+ifndef DEBUG
+  HIP_OPT := -Ofast
+else
+  HIP_OPT := -g -O0
+endif
+
+CONFIG_HIPFLAGS ?= -x hip --offload-arch=$(HIP_ARCH) $(HIP_OPT)
diff --git a/makefiles/dependencies/hip.mk b/makefiles/dependencies/hip.mk
new file mode 100644
index 000000000..8fc47235a
--- /dev/null
+++ b/makefiles/dependencies/hip.mk
@@ -0,0 +1,33 @@
+ifdef HIP_INSTALL_PATH
+  HIP_INC_PATH := -I$(HIP_INSTALL_PATH)/include
+  HIP_LIB_PATH := -L$(HIP_INSTALL_PATH)/lib64 -L$(HIP_INSTALL_PATH)/lib
+endif
+
+HIP_TEST = $(CONFIG_HIP) $(HIP_INC_PATH) \
+  $(DEPS_DIR)/tests/hip.cpp $(HIP_LIB_PATH) $(HIP_LINK) \
+  -o $(DEPS_DIR)/tests/hip
+
+$(file > $(DEP_BUILD_LOG),$(HIP_TEST))
+$(shell $(HIP_TEST) >> $(DEP_BUILD_LOG) 2>&1)
+
+ifneq ($(.SHELLSTATUS),0)
+  HIP_LINK ?= #-lculibos -lpthread -lrt -ldl
+
+  $(file >> $(DEP_BUILD_LOG),$(HIP_TEST))
+  $(shell $(HIP_TEST) >> $(DEP_BUILD_LOG) 2>&1)
+endif
+
+ifeq ($(.SHELLSTATUS),0)
+  $(shell rm -f $(DEPS_DIR)/tests/hip)
+
+  $(call info_bold,  > HIP libraries $(TEXT_FOUND) (link flags: $(or $(HIP_LINK), none)))
+
+  CONFIG_HAVE_HIP := true
+
+  CONFIG_HIP_INC := $(strip $(HIP_INC_PATH) $(HIP_DEF))
+  CONFIG_HIP_LIB := $(strip $(HIP_LIB_PATH) $(HIP_LINK))
+else
+  $(call info_bold,  > HIP libraries $(TEXT_NOTFOUND):)
+  $(info $(file < $(DEP_BUILD_LOG)))
+  $(info )
+endif
diff --git a/makefiles/dependencies/tests/hip.cpp b/makefiles/dependencies/tests/hip.cpp
new file mode 100644
index 000000000..daff0a814
--- /dev/null
+++ b/makefiles/dependencies/tests/hip.cpp
@@ -0,0 +1,7 @@
+#include <hip/hip_runtime.h>
+
+int main() {
+  int deviceCount = 0;
+  auto err = hipGetDeviceCount(&deviceCount);
+  return err;
+}
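For reference, the pattern rules added to c_app.mk earlier reduce to a plain hipcc invocation over the translator output; for an app such as airfoil the generated kernels would be compiled roughly as follows (paths and architecture illustrative):

    hipcc -x hip --offload-arch=gfx90a -Ofast -I$OP2_INSTALL_PATH/c/include \
      -c hip/airfoil_kernels.cpp -o hip/airfoil_kernels.o
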
diff --git a/op2/Makefile b/op2/Makefile
index 37f418acc..c7ff8ac3b 100644
--- a/op2/Makefile
+++ b/op2/Makefile
@@ -32,6 +32,14 @@ ifeq ($(HAVE_C),true)
     endif
   endif
 
+  ifeq ($(HAVE_HIP),true)
+    OP2_BUILDABLE_LIBS += hip
+
+    ifeq ($(HIP_IS_BUILDABLE),true)
+      OP2_BUILDABLE_LIBS += mpi_hip
+    endif
+  endif
+
   ifeq ($(MPI_IS_BUILDABLE),true)
     OP2_BUILDABLE_LIBS += mpi
   endif
@@ -110,6 +118,10 @@ OP2_CUDA := $(OP2_BASE) $(addprefix $(OBJ)/,\
   cuda/op_cuda_decl.o \
   cuda/op_cuda_rt_support.o)
 
+OP2_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\
+  hip/op_hip_decl.o \
+  hip/op_hip_rt_support.o)
+
 OP2_FOR_CUDA := $(OP2_CUDA) $(OP2_FOR_BASE_CUDA) $(addprefix $(OBJ)/fortran/,\
   op2_for_rt_wrappers_cuda.o \
   cudaConfigurationParams.o)
@@ -153,6 +165,18 @@ OP2_MPI_CUDA := $(OP2_BASE) $(addprefix $(OBJ)/,\
   externlib/op_util.o \
   externlib/op_renumber.o)
 
+OP2_MPI_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\
+  hip/op_hip_rt_support+mpi.o \
+  mpi/op_mpi_core.o \
+  mpi/op_mpi_part_core.o \
+  mpi/op_mpi_hip_decl.o \
+  mpi/op_mpi_hip_rt_support.o \
+  mpi/op_mpi_hip_kernels.o \
+  mpi/op_mpi_hdf5.o \
+  mpi/op_mpi_util.o \
+  externlib/op_util.o \
+  externlib/op_renumber.o)
+
 OP2_FOR_MPI_CUDA := $(OP2_MPI_CUDA) $(OP2_FOR_BASE_MPI_CUDA) $(addprefix $(OBJ)/fortran/,\
   op2_for_rt_wrappers_cuda.o \
   cudaConfigurationParams.o)
@@ -186,6 +210,12 @@ $(OBJ)/cuda/%+mpi.o: src/cuda/%.cpp | $(OBJ)
 $(OBJ)/cuda/%.o: src/cuda/%.cpp | $(OBJ)
 	$(CXX) $(CXXFLAGS) $(INC) -DSET_CUDA_CACHE_CONFIG -c $< -o $@
 
+$(OBJ)/hip/%+mpi.o: src/hip/%.cpp | $(OBJ)
+	$(MPICXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -DOPMPI -c $< -o $@
+
+$(OBJ)/hip/%.o: src/hip/%.cpp | $(OBJ)
+	$(CXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -c $< -o $@
+
 $(OBJ)/openmp4/%.o: src/openmp4/%.cpp | $(OBJ)
 	$(CXX) $(CXXFLAGS) $(OMP_OFFLOAD_CXXFLAGS) $(INC) -c $< -o $@
 
@@ -195,6 +225,15 @@ $(OBJ)/mpi/%.o: src/mpi/%.cpp | $(OBJ)
 $(OBJ)/mpi/%.o: src/mpi/%.cu | $(OBJ)
 	$(NVCC) $(NVCCFLAGS) $(INC) -c $< -o $@
 
+$(OBJ)/mpi/op_mpi_hip_decl.o: src/mpi/op_mpi_hip_decl.cpp | $(OBJ)
+	$(MPICXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -c $< -o $@
+
+$(OBJ)/mpi/op_mpi_hip_rt_support.o: src/mpi/op_mpi_hip_rt_support.cpp | $(OBJ)
+	$(MPICXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -c $< -o $@
+
+$(OBJ)/mpi/op_mpi_hip_kernels.o: src/mpi/op_mpi_hip_kernels.cpp | $(OBJ)
+	$(MPICXX) $(CXXFLAGS) $(HIPFLAGS) $(INC) -c $< -o $@
+
 $(OBJ)/fortran/%+mpi.o: src/fortran/%.c | $(OBJ)
 	$(MPICC) $(CFLAGS) $(INC) -DOPMPI -c $< -o $@
 
diff --git a/op2/include/op_hip_reduction.h b/op2/include/op_hip_reduction.h
new file mode 100644
index 000000000..bc21a66e7
--- /dev/null
+++ b/op2/include/op_hip_reduction.h
@@ -0,0 +1,259 @@
+#include "hip/hip_runtime.h"
+/*
+ * Open source copyright declaration based on BSD open source template:
+ * http://www.opensource.org/licenses/bsd-license.php
+ *
+ * This file is part of the OP2 distribution.
+ *
+ * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in
+ * the main source directory for a full list of copyright holders.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * The name of Mike Giles may not be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __OP_HIP_REDUCTION_H
+#define __OP_HIP_REDUCTION_H
+
+/*
+ * This file provides an optimised implementation for reduction of OP2 global
+ * variables.
+ * It is separated from the op_hip_rt_support.h file because the reduction code
+ * is based on C++ templates, while the other file only includes C routines.
+ */ + +#include + +/* + * reduction routine for arbitrary datatypes + */ + +template +__inline__ __device__ void op_reduction(volatile T *dat_g, T dat_l) { + extern __shared__ volatile double temp2[]; + __shared__ volatile T *temp; + temp = (T *)temp2; + T dat_t; + + __syncthreads(); /* important to finish all previous activity */ + + int tid = threadIdx.x; + temp[tid] = dat_l; + + // first, cope with blockDim.x perhaps not being a power of 2 + + __syncthreads(); + + int d = 1 << (31 - __clz(((int)blockDim.x - 1))); + // d = blockDim.x/2 rounded up to nearest power of 2 + + if (tid + d < blockDim.x) { + dat_t = temp[tid + d]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + + temp[tid] = dat_l; + } + + // second, do reductions involving more than one warp + + for (d >>= 1; d > warpSize; d >>= 1) { + __syncthreads(); + + if (tid < d) { + dat_t = temp[tid + d]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + + temp[tid] = dat_l; + } + } + + // third, do reductions involving just one warp + + __syncthreads(); + + if (tid < warpSize) { + for (; d > 0; d >>= 1) { + // __syncwarp(); + if (tid < d) { + dat_t = temp[tid + d]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + + temp[tid] = dat_l; + } + } + + // finally, update global reduction variable + + if (tid == 0) { + switch (reduction) { + case OP_INC: + *dat_g = *dat_g + dat_l; + break; + case OP_MIN: + if (dat_l < *dat_g) + *dat_g = dat_l; + break; + case OP_MAX: + if (dat_l > *dat_g) + *dat_g = dat_l; + break; + } + } + } +} + +/* + * reduction routine for arbitrary datatypes + * (alternative version using just one warp) + * + */ + +template +__inline__ __device__ void op_reduction_alt(volatile T *dat_g, T dat_l) { + extern __shared__ volatile double temp2[]; + __shared__ volatile T *temp; + temp = (T *)temp2; + T dat_t; + + __syncthreads(); /* important to finish all previous activity */ + + int tid = threadIdx.x; + temp[tid] = dat_l; + + __syncthreads(); + + // set number of active threads + + int d = warpSize; + + if (blockDim.x < warpSize) + d = 1 << (31 - __clz((int)blockDim.x)); + // this gives blockDim.x rounded down to nearest power of 2 + + if (tid < d) { + + // first, do reductions for each thread + + for (int t = tid + d; t < blockDim.x; t += d) { + dat_t = temp[t]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + } + + temp[tid] = dat_l; + + // second, do reductions to combine thread reductions + + for (d >>= 1; d > 0; d >>= 1) { + if (tid < d) { + dat_t = temp[tid + d]; + + switch (reduction) { + case OP_INC: + dat_l = dat_l + dat_t; + break; + case OP_MIN: + if (dat_t < dat_l) + dat_l = dat_t; + break; + case OP_MAX: + if (dat_t > dat_l) + dat_l = dat_t; + break; + } + + temp[tid] = dat_l; + } + } + + // finally, update global reduction variable + + if (tid == 0) { + switch (reduction) { + case OP_INC: + *dat_g = *dat_g + dat_l; + break; + case OP_MIN: + if (dat_l < *dat_g) + *dat_g = dat_l; + break; + case OP_MAX: 
+        if (dat_l > *dat_g)
+          *dat_g = dat_l;
+        break;
+      }
+    }
+  }
+}
+
+#endif /* __OP_HIP_REDUCTION_H */
diff --git a/op2/include/op_hip_rt_support.h b/op2/include/op_hip_rt_support.h
new file mode 100644
index 000000000..78eb42908
--- /dev/null
+++ b/op2/include/op_hip_rt_support.h
@@ -0,0 +1,137 @@
+/*
+ * Open source copyright declaration based on BSD open source template:
+ * http://www.opensource.org/licenses/bsd-license.php
+ *
+ * This file is part of the OP2 distribution.
+ *
+ * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in
+ * the main source directory for a full list of copyright holders.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * The name of Mike Giles may not be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __OP_HIP_RT_SUPPORT_H
+#define __OP_HIP_RT_SUPPORT_H
+
+/*
+ * This header file declares the HIP back-end specific run-time functions
+ * to be used by the code generated by OP2 compiler.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+/* define HIP warpsize for OP2 */
+
+#define OP_WARPSIZE 32
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Global variables actually defined in the corresponding c file
+ */
+extern char *OP_consts_h, *OP_consts_d, *OP_reduct_h, *OP_reduct_d;
+
+extern void __syncthreads();
+
+extern hipStream_t op2_grp_secondary;
+
+/*
+ * personal stripped-down version of cutil_inline.h
+ */
+
+#define cutilSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
+#define cutilCheckMsg(msg) __cutilCheckMsg(msg, __FILE__, __LINE__)
+
+void __cudaSafeCall(hipError_t err, const char *file, const int line);
+
+void __cutilCheckMsg(const char *errorMessage, const char *file,
+                     const int line);
+
+void cutilDeviceInit(int argc, char **argv);
+
+void cutilDeviceInit_mpi(int argc, char **argv, int mpi_rank);
+
+/*
+ * routines to move arrays to/from GPU device
+ */
+
+void op_mvHostToDevice(void **map, int size);
+
+void op_cpHostToDevice(void **data_d, void **data_h, int size);
+
+void op_cuda_get_data(op_dat dat);
+
+/*
+ * Plan interface for generated code: the implementation changes depending
+ * on the actual back-end libraries (e.g. hip or openmp) and it is
+ * hence declared here for hip.
+ * To avoid linking various header files together, the design requires the + * declaration of this function as back-end specific, while the common + * op_rt_support.h header file only declares the low-level functions + * (e.g. op_plan_core) + */ +op_plan *op_plan_get(char const *name, op_set set, int part_size, int nargs, + op_arg *args, int ninds, int *inds); + +op_plan *op_plan_get_stage(char const *name, op_set set, int part_size, + int nargs, op_arg *args, int ninds, int *inds, + int staging); + +op_plan *op_plan_get_stage_upload(char const *name, op_set set, int part_size, + int nargs, op_arg *args, int ninds, int *inds, + int staging, int upload); + +void op_cuda_exit(); + +/* + * routines to resize constant/reduct arrays, if necessary + */ + +void reallocConstArrays(int consts_bytes); + +void reallocReductArrays(int reduct_bytes); + +/* + * routines to move constant/reduct arrays + */ + +void mvConstArraysToDevice(int consts_bytes); + +void mvConstArraysToHost(int consts_bytes); + +void mvReductArraysToDevice(int reduct_bytes); + +void mvReductArraysToHost(int reduct_bytes); + +#ifdef __cplusplus +} +#endif + +#endif /* __OP_HIP_RT_SUPPORT_H */ diff --git a/op2/src/hip/op_hip_decl.cpp b/op2/src/hip/op_hip_decl.cpp new file mode 100644 index 000000000..e3b3dc586 --- /dev/null +++ b/op2/src/hip/op_hip_decl.cpp @@ -0,0 +1,393 @@ +/* + * Open source copyright declaration based on BSD open source template: + * http://www.opensource.org/licenses/bsd-license.php + * + * This file is part of the OP2 distribution. + * + * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in + * the main source directory for a full list of copyright holders. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * The name of Mike Giles may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+//
+// This file implements the OP2 user-level functions for the HIP backend
+//
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <op_lib_c.h>
+#include <op_lib_core.h>
+#include <op_hip_rt_support.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// HIP-specific OP2 functions
+//
+
+void op_init_soa(int argc, char **argv, int diags, int soa) {
+  OP_auto_soa = soa;
+  op_init(argc, argv, diags);
+}
+
+void op_init(int argc, char **argv, int diags) {
+  op_init_core(argc, argv, diags);
+
+  cutilDeviceInit(argc, argv);
+
+//
+// The following call is only made in the C version of OP2,
+// as it causes memory trashing when called from Fortran.
+// \warning add -DSET_CUDA_CACHE_CONFIG to compiling line
+// for this file when implementing C OP2.
+//
+
+#ifdef SET_CUDA_CACHE_CONFIG
+  cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+#endif
+
+  cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+  cutilSafeCall(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte));
+  printf("\n 16/48 L1/shared \n");
+}
+
+void op_mpi_init(int argc, char **argv, int diags, int global, int local) {
+  op_init_core(argc, argv, diags);
+
+
+  cutilDeviceInit(argc, argv);
+
+//
+// The following call is only made in the C version of OP2,
+// as it causes memory trashing when called from Fortran.
+// \warning add -DSET_CUDA_CACHE_CONFIG to compiling line
+// for this file when implementing C OP2.
+//
+
+#ifdef SET_CUDA_CACHE_CONFIG
+  cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+#endif
+
+  cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared));
+  cutilSafeCall(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte));
+  printf("\n 16/48 L1/shared \n");
+}
+
+void op_mpi_init_soa(int argc, char **argv, int diags, int global, int local,
+                     int soa) {
+  OP_auto_soa = soa;
+  op_mpi_init(argc, argv, diags, global, local);
+}
+
+op_dat op_decl_dat_char(op_set set, int dim, char const *type, int size,
+                        char *data, char const *name) {
+  op_dat dat = op_decl_dat_core(set, dim, type, size, data, name);
+
+  // transpose data
+  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  if (strstr(type, ":soa") != NULL || (OP_auto_soa && dim > 1)) {
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          temp_data[element_size * i * set_size + element_size * j + c] =
+              data[dat->size * j + element_size * i + c];
+        }
+      }
+    }
+    op_cpHostToDevice((void **)&(dat->data_d), (void **)&(temp_data),
+                      dat->size * set_size);
+    free(temp_data);
+  } else {
+    op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data),
+                      dat->size * set_size);
+  }
+
+  return dat;
+}
+
+op_dat op_decl_dat_temp_char(op_set set, int dim, char const *type, int size,
+                             char const *name) {
+  char *data = NULL;
+  op_dat dat = op_decl_dat_temp_core(set, dim, type, size, data, name);
+
+  for (size_t i = 0; i < set->size * dim * size; i++)
+    dat->data[i] = 0;
+  dat->user_managed = 0;
+
+  op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data),
+                    dat->size * set->size);
+
+  return dat;
+}
+
+int op_free_dat_temp_char(op_dat dat) {
+  // free data on device
+  cutilSafeCall(hipFree(dat->data_d));
+
+  return op_free_dat_temp_core(dat);
+}
+
+op_set op_decl_set(int size, char const *name) {
+  return op_decl_set_core(size, name);
+}
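The SoA branch of op_decl_dat_char above re-lays the user's array-of-structs data component-by-component across the padded set (owned plus halo elements) before upload. A worked sketch of the index mapping, assuming dim = 2 and set_size = 3:

    // byte c of component i of element j moves from AoS offset
    //   dat->size * j + element_size * i + c
    // to SoA offset
    //   element_size * (i * set_size + j) + c
    // so x0 y0 x1 y1 x2 y2  becomes  x0 x1 x2 y0 y1 y2
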
+
+op_map op_decl_map(op_set from, op_set to, int dim, int *imap,
+                   char const *name) {
+  op_map map = op_decl_map_core(from, to, dim, imap, name);
+  int set_size = map->from->size + map->from->exec_size;
+  int *temp_map = (int *)malloc(map->dim * set_size * sizeof(int));
+  for (int i = 0; i < map->dim; i++) {
+    for (int j = 0; j < set_size; j++) {
+      temp_map[i * set_size + j] = map->map[map->dim * j + i];
+    }
+  }
+  op_cpHostToDevice((void **)&(map->map_d), (void **)&(temp_map),
+                    map->dim * set_size * sizeof(int));
+  free(temp_map);
+  return map;
+}
+
+op_arg op_arg_dat(op_dat dat, int idx, op_map map, int dim, char const *type,
+                  op_access acc) {
+  return op_arg_dat_core(dat, idx, map, dim, type, acc);
+}
+
+op_arg op_opt_arg_dat(int opt, op_dat dat, int idx, op_map map, int dim,
+                      char const *type, op_access acc) {
+  return op_opt_arg_dat_core(opt, dat, idx, map, dim, type, acc);
+}
+
+op_arg op_arg_gbl_char(char *data, int dim, const char *type, int size,
+                       op_access acc) {
+  return op_arg_gbl_core(1, data, dim, type, size, acc);
+}
+
+op_arg op_opt_arg_gbl_char(int opt, char *data, int dim, const char *type,
+                           int size, op_access acc) {
+  return op_arg_gbl_core(opt, data, dim, type, size, acc);
+}
+
+//
+// This function is defined in the generated master kernel file
+// so that it is possible to check on the runtime size of the
+// data in cases where it is not known at compile time
+//
+
+/*
+void
+op_decl_const_char ( int dim, char const * type, int size, char * dat,
+                     char const * name )
+{
+  cutilSafeCall ( hipMemcpyToSymbol(HIP_SYMBOL(name), dat, dim * size, 0,
+                  hipMemcpyHostToDevice ) );
+}
+*/
+
+int op_get_size(op_set set) { return set->size; }
+
+void op_printf(const char *format, ...) {
+  va_list argptr;
+  va_start(argptr, format);
+  vprintf(format, argptr);
+  va_end(argptr);
+}
+
+void op_print(const char *line) { printf("%s\n", line); }
+
+void op_timers(double *cpu, double *et) { op_timers_core(cpu, et); }
+
+int getSetSizeFromOpArg(op_arg *arg) {
+  return arg->opt ? arg->dat->set->size : 0;
+}
+
+void op_renumber(op_map base) { (void)base; }
+
+void op_renumber_ptr(int *ptr){};
+
+int getHybridGPU() { return OP_hybrid_gpu; }
+
+void op_exit() {
+  op_cuda_exit(); // frees dat_d memory
+  op_rt_exit();   // frees plan memory
+  op_exit_core(); // frees lib core variables
+}
+
+void op_timing_output() {
+  op_timing_output_core();
+  printf("Total plan time: %8.4f\n", OP_plan_time);
+}
+
+void op_timings_to_csv(const char *outputFileName) {
+  FILE *outputFile = fopen(outputFileName, "w");
+  if (outputFile == NULL) {
+    printf("ERROR: Failed to open file for writing: '%s'\n", outputFileName);
+  }
+  else {
+    fprintf(outputFile, "rank,thread,nranks,nthreads,count,total time,plan time,mpi time,GB used,GB total,kernel name\n");
+  }
+
+  if (outputFile != NULL) {
+    for (int n = 0; n < OP_kern_max; n++) {
+      if (OP_kernels[n].count > 0) {
+        if (OP_kernels[n].ntimes == 1 && OP_kernels[n].times[0] == 0.0f &&
+            OP_kernels[n].time != 0.0f) {
+          // This library is being used by an OP2 translation made with the
+          // older
+          // translator with older timing logic.
Adjust to new logic: + OP_kernels[n].times[0] = OP_kernels[n].time; + } + + for (int thr=0; thrdat; + size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; + if (dat->data_d) { + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); + int element_size = dat->size / dat->dim; + for (int i = 0; i < dat->dim; i++) { + for (int j = 0; j < set_size; j++) { + for (int c = 0; c < element_size; c++) { + temp_data[element_size * i * set_size + element_size * j + c] = + dat->data[dat->size * j + element_size * i + c]; + } + } + } + cutilSafeCall(hipMemcpy(dat->data_d, temp_data, dat->size * set_size, + hipMemcpyHostToDevice)); + dat->dirty_hd = 0; + free(temp_data); + } else { + cutilSafeCall(hipMemcpy(dat->data_d, dat->data, dat->size * set_size, + hipMemcpyHostToDevice)); + dat->dirty_hd = 0; + } + } + } +} + +void op_fetch_data_char(op_dat dat, char *usr_ptr) { + op_cuda_get_data(dat); + // need to copy data into memory pointed to by usr_ptr + memcpy((void *)usr_ptr, (void *)dat->data, dat->set->size * dat->size); +} + +void op_fetch_data_idx_char(op_dat dat, char *usr_ptr, int low, int high) { + op_cuda_get_data(dat); + if (low < 0 || high > dat->set->size - 1) { + printf("op_fetch_data: Indices not within range of elements held in %s\n", + dat->name); + exit(2); + } + // need to copy data into memory pointed to by usr_ptr + memcpy((void *)usr_ptr, (void *)&dat->data[low * dat->size], + (high + 1) * dat->size); +} + +// Dummy for cuda compile + +typedef struct { +} op_export_core; + +typedef op_export_core *op_export_handle; + +typedef struct { +} op_import_core; + +typedef op_import_core *op_import_handle; + +op_import_handle op_import_init_size(int nprocs, int *proclist, op_dat mark) { + + exit(1); +} + +op_import_handle op_import_init(op_export_handle exp_handle, op_dat coords, + op_dat mark) { + + exit(1); +} + +op_export_handle op_export_init(int nprocs, int *proclist, op_map cellsToNodes, + op_set sp_nodes, op_dat coords, op_dat mark) { + + exit(1); +} + +void op_theta_init(op_export_handle handle, int *bc_id, double *dtheta_exp, + double *dtheta_imp, double *alpha) { + + exit(1); +} + +void op_inc_theta(op_export_handle handle, int *bc_id, double *dtheta_exp, + double *dtheta_imp) { + + exit(1); +} + +void op_export_data(op_export_handle handle, op_dat dat) { exit(1); } + +void op_import_data(op_import_handle handle, op_dat dat) { exit(1); } + +#ifdef __cplusplus +} +#endif diff --git a/op2/src/hip/op_hip_rt_support.cpp b/op2/src/hip/op_hip_rt_support.cpp new file mode 100644 index 000000000..c0ae1e637 --- /dev/null +++ b/op2/src/hip/op_hip_rt_support.cpp @@ -0,0 +1,555 @@ +/* + * Open source copyright declaration based on BSD open source template: + * http://www.opensource.org/licenses/bsd-license.php + * + * This file is part of the OP2 distribution. + * + * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in + * the main source directory for a full list of copyright holders. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * The name of Mike Giles may not be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// This file implements the HIP-specific run-time support functions
+//
+
+//
+// header files
+//
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+#include <op_lib_c.h>
+#include <op_lib_core.h>
+#include <op_rt_support.h>
+#include <op_hip_rt_support.h>
+
+// Small re-declaration to avoid using struct in the C version.
+// This is due to the different way in which C and C++ see structs
+
+typedef struct hipDeviceProp_t cudaDeviceProp_t;
+
+
+// arrays for global constants and reductions
+
+int OP_consts_bytes = 0, OP_reduct_bytes = 0;
+
+char *OP_consts_h, *OP_consts_d, *OP_reduct_h, *OP_reduct_d;
+
+//
+// HIP utility functions
+//
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void __cudaSafeCall(hipError_t err, const char *file, const int line) {
+  if (hipSuccess != err) {
+    fprintf(stderr, "%s(%i) : cutilSafeCall() Runtime API error : %s.\n", file,
+            line, hipGetErrorString(err));
+    exit(-1);
+  }
+}
+
+void __cutilCheckMsg(const char *errorMessage, const char *file,
+                     const int line) {
+  hipError_t err = hipGetLastError();
+  if (hipSuccess != err) {
+    fprintf(stderr, "%s(%i) : cutilCheckMsg() error : %s : %s.\n", file, line,
+            errorMessage, hipGetErrorString(err));
+    exit(-1);
+  }
+}
+
+//
+// routines to move arrays to/from GPU device
+//
+
+void op_mvHostToDevice(void **map, int size) {
+  if (!OP_hybrid_gpu || size == 0)
+    return;
+  void *tmp;
+  cutilSafeCall(hipMalloc(&tmp, size));
+  cutilSafeCall(hipMemcpy(tmp, *map, size, hipMemcpyHostToDevice));
+  cutilSafeCall(hipDeviceSynchronize());
+  free(*map);
+  *map = tmp;
+}
+
+void op_cpHostToDevice(void **data_d, void **data_h, int size) {
+  if (!OP_hybrid_gpu)
+    return;
+  if (*data_d != NULL) cutilSafeCall(hipFree(*data_d));
+  cutilSafeCall(hipMalloc(data_d, size));
+  cutilSafeCall(hipMemcpy(*data_d, *data_h, size, hipMemcpyHostToDevice));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+op_plan *op_plan_get(char const *name, op_set set, int part_size, int nargs,
+                     op_arg *args, int ninds, int *inds) {
+  return op_plan_get_stage(name, set, part_size, nargs, args, ninds, inds,
+                           OP_STAGE_ALL);
+}
+
+op_plan *op_plan_get_stage(char const *name, op_set set, int part_size,
+                           int nargs, op_arg *args, int ninds, int *inds,
+                           int staging) {
+  return op_plan_get_stage_upload(name, set, part_size, nargs, args, ninds, inds,
+                                  staging, 1);
+}
+
+op_plan *op_plan_get_stage_upload(char const *name, op_set set, int part_size,
+                                  int nargs, op_arg *args, int ninds, int *inds,
+                                  int staging, int
upload) { + op_plan *plan = + op_plan_core(name, set, part_size, nargs, args, ninds, inds, staging); + if (!OP_hybrid_gpu || !upload) + return plan; + + int set_size = set->size; + for (int i = 0; i < nargs; i++) { + if (args[i].idx != -1 && args[i].acc != OP_READ) { + set_size += set->exec_size; + break; + } + } + + if (plan->count == 1) { + int *offsets = (int *)malloc((plan->ninds_staged + 1) * sizeof(int)); + offsets[0] = 0; + for (int m = 0; m < plan->ninds_staged; m++) { + int count = 0; + for (int m2 = 0; m2 < nargs; m2++) + if (plan->inds_staged[m2] == m) + count++; + offsets[m + 1] = offsets[m] + count; + } + op_mvHostToDevice((void **)&(plan->ind_map), + offsets[plan->ninds_staged] * set_size * sizeof(int)); + for (int m = 0; m < plan->ninds_staged; m++) { + plan->ind_maps[m] = &plan->ind_map[set_size * offsets[m]]; + } + free(offsets); + + int counter = 0; + for (int m = 0; m < nargs; m++) + if (plan->loc_maps[m] != NULL) + counter++; + op_mvHostToDevice((void **)&(plan->loc_map), + sizeof(short) * counter * set_size); + counter = 0; + for (int m = 0; m < nargs; m++) + if (plan->loc_maps[m] != NULL) { + plan->loc_maps[m] = &plan->loc_map[set_size * counter]; + counter++; + } + + op_mvHostToDevice((void **)&(plan->ind_sizes), + sizeof(int) * plan->nblocks * plan->ninds_staged); + op_mvHostToDevice((void **)&(plan->ind_offs), + sizeof(int) * plan->nblocks * plan->ninds_staged); + op_mvHostToDevice((void **)&(plan->nthrcol), sizeof(int) * plan->nblocks); + op_mvHostToDevice((void **)&(plan->thrcol), sizeof(int) * set_size); + op_mvHostToDevice((void **)&(plan->col_reord), sizeof(int) * set_size); + op_mvHostToDevice((void **)&(plan->offset), sizeof(int) * plan->nblocks); + plan->offset_d = plan->offset; + op_mvHostToDevice((void **)&(plan->nelems), sizeof(int) * plan->nblocks); + plan->nelems_d = plan->nelems; + op_mvHostToDevice((void **)&(plan->blkmap), sizeof(int) * plan->nblocks); + plan->blkmap_d = plan->blkmap; + } + + return plan; +} + +void op_cuda_exit() { + if (!OP_hybrid_gpu) + return; + op_dat_entry *item; + TAILQ_FOREACH(item, &OP_dat_list, entries) { + cutilSafeCall(hipFree((item->dat)->data_d)); + } + + for (int ip = 0; ip < OP_plan_index; ip++) { + OP_plans[ip].ind_map = NULL; + OP_plans[ip].loc_map = NULL; + OP_plans[ip].ind_sizes = NULL; + OP_plans[ip].ind_offs = NULL; + OP_plans[ip].nthrcol = NULL; + OP_plans[ip].thrcol = NULL; + OP_plans[ip].col_reord = NULL; + OP_plans[ip].offset = NULL; + OP_plans[ip].nelems = NULL; + OP_plans[ip].blkmap = NULL; + } + // hipDeviceReset ( ); +} + +// +// routines to resize constant/reduct arrays, if necessary +// + +void reallocConstArrays(int consts_bytes) { + if (consts_bytes > OP_consts_bytes) { + if (OP_consts_bytes > 0) { + free(OP_consts_h); + cutilSafeCall(hipFree(OP_consts_d)); + } + OP_consts_bytes = 4 * consts_bytes; // 4 is arbitrary, more than needed + OP_consts_h = (char *)malloc(OP_consts_bytes); + cutilSafeCall(hipMalloc((void **)&OP_consts_d, OP_consts_bytes)); + } +} + +void reallocReductArrays(int reduct_bytes) { + if (reduct_bytes > OP_reduct_bytes) { + if (OP_reduct_bytes > 0) { + free(OP_reduct_h); + cutilSafeCall(hipFree(OP_reduct_d)); + } + OP_reduct_bytes = 4 * reduct_bytes; // 4 is arbitrary, more than needed + OP_reduct_h = (char *)malloc(OP_reduct_bytes); + cutilSafeCall(hipMalloc((void **)&OP_reduct_d, OP_reduct_bytes)); + } +} + +// +// routines to move constant/reduct arrays +// + +void mvConstArraysToDevice(int consts_bytes) { + cutilSafeCall(hipMemcpy(OP_consts_d, OP_consts_h, consts_bytes, + 
hipMemcpyHostToDevice));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+void mvConstArraysToHost(int consts_bytes) {
+  cutilSafeCall(hipMemcpy(OP_consts_h, OP_consts_d, consts_bytes,
+                          hipMemcpyDeviceToHost));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+void mvReductArraysToDevice(int reduct_bytes) {
+  cutilSafeCall(hipMemcpy(OP_reduct_d, OP_reduct_h, reduct_bytes,
+                          hipMemcpyHostToDevice));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+void mvReductArraysToHost(int reduct_bytes) {
+  cutilSafeCall(hipMemcpy(OP_reduct_h, OP_reduct_d, reduct_bytes,
+                          hipMemcpyDeviceToHost));
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+//
+// routine to fetch data from GPU to CPU (with transposing SoA to AoS if needed)
+//
+
+void op_cuda_get_data(op_dat dat) {
+  if (!OP_hybrid_gpu)
+    return;
+  if (dat->dirty_hd == 2)
+    dat->dirty_hd = 0;
+  else
+    return;
+  // transpose data
+  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
+    cutilSafeCall(hipMemcpy(temp_data, dat->data_d, dat->size * set_size,
+                            hipMemcpyDeviceToHost));
+    cutilSafeCall(hipDeviceSynchronize());
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          dat->data[dat->size * j + element_size * i + c] =
+              temp_data[element_size * i * set_size + element_size * j + c];
+        }
+      }
+    }
+    free(temp_data);
+  } else {
+    cutilSafeCall(hipMemcpy(dat->data, dat->data_d, dat->size * set_size,
+                            hipMemcpyDeviceToHost));
+    cutilSafeCall(hipDeviceSynchronize());
+  }
+}
+
+void deviceSync() {
+  cutilSafeCall(hipDeviceSynchronize());
+}
+
+#ifndef OPMPI
+
+void cutilDeviceInit(int argc, char **argv) {
+  (void)argc;
+  (void)argv;
+  int deviceCount;
+  cutilSafeCall(hipGetDeviceCount(&deviceCount));
+  if (deviceCount == 0) {
+    printf("cutil error: no devices supporting HIP\n");
+    exit(-1);
+  }
+
+  // Test we have access to a device
+  float *test;
+  hipError_t err = hipMalloc((void **)&test, sizeof(float));
+  if (err != hipSuccess) {
+    OP_hybrid_gpu = 0;
+  } else {
+    OP_hybrid_gpu = 1;
+  }
+  if (OP_hybrid_gpu) {
+    hipFree(test);
+
+    cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferL1));
+
+    int deviceId = -1;
+    hipGetDevice(&deviceId);
+    cudaDeviceProp_t deviceProp;
+    cutilSafeCall(hipGetDeviceProperties(&deviceProp, deviceId));
+    printf("\n Using HIP device: %d %s\n", deviceId, deviceProp.name);
+  } else {
+    printf("\n Using CPU\n");
+  }
+}
+
+void op_upload_dat(op_dat dat) {
+  if (!OP_hybrid_gpu)
+    return;
+  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          temp_data[element_size * i * set_size + element_size * j + c] =
+              dat->data[dat->size * j + element_size * i + c];
+        }
+      }
+    }
+    cutilSafeCall(hipMemcpy(dat->data_d, temp_data, set_size * dat->size,
+                            hipMemcpyHostToDevice));
+    free(temp_data);
+  } else {
+    cutilSafeCall(hipMemcpy(dat->data_d, dat->data, set_size * dat->size,
+                            hipMemcpyHostToDevice));
+  }
+}
+
+void op_download_dat(op_dat dat) {
+  if (!OP_hybrid_gpu)
+    return;
+  size_t set_size = dat->set->size +
dat->set->exec_size + dat->set->nonexec_size; + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); + cutilSafeCall(hipMemcpy(temp_data, dat->data_d, set_size * dat->size, + hipMemcpyDeviceToHost)); + int element_size = dat->size / dat->dim; + for (int i = 0; i < dat->dim; i++) { + for (int j = 0; j < set_size; j++) { + for (int c = 0; c < element_size; c++) { + dat->data[dat->size * j + element_size * i + c] = + temp_data[element_size * i * set_size + element_size * j + c]; + } + } + } + free(temp_data); + } else { + cutilSafeCall(hipMemcpy(dat->data, dat->data_d, set_size * dat->size, + hipMemcpyDeviceToHost)); + } +} + +int op_mpi_halo_exchanges(op_set set, int nargs, op_arg *args) { + for (int n = 0; n < nargs; n++) + if (args[n].opt && args[n].argtype == OP_ARG_DAT && + args[n].dat->dirty_hd == 2) { + op_download_dat(args[n].dat); + args[n].dat->dirty_hd = 0; + } + return set->size; +} + +int op_mpi_halo_exchanges_grouped(op_set set, int nargs, op_arg *args, int device){ + (void)device; + return device == 1 ? op_mpi_halo_exchanges(set, nargs, args) : op_mpi_halo_exchanges_cuda(set, nargs, args); +} + +void op_mpi_set_dirtybit(int nargs, op_arg *args) { + for (int n = 0; n < nargs; n++) { + if ((args[n].opt == 1) && (args[n].argtype == OP_ARG_DAT) && + (args[n].acc == OP_INC || args[n].acc == OP_WRITE || + args[n].acc == OP_RW)) { + args[n].dat->dirty_hd = 1; + } + } +} + +void op_mpi_wait_all(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +void op_mpi_wait_all_grouped(int nargs, op_arg *args, int device) { + (void)device; + (void)nargs; + (void)args; +} + +void op_mpi_test_all(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +void op_mpi_test_all_grouped(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +int op_mpi_halo_exchanges_cuda(op_set set, int nargs, op_arg *args) { + for (int n = 0; n < nargs; n++) + if (args[n].opt && args[n].argtype == OP_ARG_DAT && + args[n].dat->dirty_hd == 1) { + op_upload_dat(args[n].dat); + args[n].dat->dirty_hd = 0; + } + return set->size; +} + +void op_mpi_set_dirtybit_cuda(int nargs, op_arg *args) { + for (int n = 0; n < nargs; n++) { + if ((args[n].opt == 1) && (args[n].argtype == OP_ARG_DAT) && + (args[n].acc == OP_INC || args[n].acc == OP_WRITE || + args[n].acc == OP_RW)) { + args[n].dat->dirty_hd = 2; + } + } +} + +void op_mpi_wait_all_cuda(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +void op_mpi_reset_halos(int nargs, op_arg *args) { + (void)nargs; + (void)args; +} + +void op_mpi_barrier() {} + +void *op_mpi_perf_time(const char *name, double time) { + (void)name; + (void)time; + return (void *)name; +} + +#ifdef COMM_PERF +void op_mpi_perf_comms(void *k_i, int nargs, op_arg *args) { + (void)k_i; + (void)nargs; + (void)args; +} +#endif + +void op_mpi_reduce_float(op_arg *args, float *data) { + (void)args; + (void)data; +} + +void op_mpi_reduce_double(op_arg *args, double *data) { + (void)args; + (void)data; +} + +void op_mpi_reduce_int(op_arg *args, int *data) { + (void)args; + (void)data; +} + +void op_mpi_reduce_bool(op_arg *args, bool *data) { + (void)args; + (void)data; +} + +void op_partition(const char *lib_name, const char *lib_routine, + op_set prime_set, op_map prime_map, op_dat coords) { + (void)lib_name; + (void)lib_routine; + (void)prime_set; + (void)prime_map; + (void)coords; +} + +void op_partition_ptr(const char *lib_name, const char *lib_routine, + op_set prime_set, int *prime_map, 
double *coords) { + (void)lib_name; + (void)lib_routine; + (void)prime_set; + (void)prime_map; + (void)coords; +} + +void op_partition_reverse() {} + +void op_compute_moment(double t, double *first, double *second) { + *first = t; + *second = t * t; +} +void op_compute_moment_across_times(double* times, int ntimes, bool ignore_zeros, double *first, double *second) { + *first = 0.0; + *second = 0.0f; + int n = 0; + for (int i=0; i +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +// +// MPI Communicator for halo creation and exchange +// + +MPI_Comm OP_MPI_WORLD; +MPI_Comm OP_MPI_GLOBAL; + +// +// HIP-specific OP2 functions +// + +void op_init(int argc, char **argv, int diags) { + op_init_soa(argc, argv, diags, 0); +} + +void op_init_soa(int argc, char **argv, int diags, int soa) { + int flag = 0; + OP_auto_soa = soa; + MPI_Initialized(&flag); + if (!flag) { + MPI_Init(&argc, &argv); + } + OP_MPI_WORLD = MPI_COMM_WORLD; + OP_MPI_GLOBAL = MPI_COMM_WORLD; + op_init_core(argc, argv, diags); + + cutilDeviceInit(argc, argv); + +// +// The following call is only made in the C version of OP2, +// as it causes memory trashing when called from Fortran. +// \warning add -DSET_CUDA_CACHE_CONFIG to compiling line +// for this file when implementing C OP2. +// + if (OP_hybrid_gpu) { +#ifdef SET_CUDA_CACHE_CONFIG + cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); +#endif + cutilSafeCall(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte)); + printf("\n 16/48 L1/shared \n"); + } +} + +void op_mpi_init(int argc, char **argv, int diags, MPI_Fint global, + MPI_Fint local) { + op_mpi_init_soa(argc, argv, diags, global, local, 0); +} + +void op_mpi_init_soa(int argc, char **argv, int diags, MPI_Fint global, + MPI_Fint local, int soa) { + OP_auto_soa = soa; + int flag = 0; + MPI_Initialized(&flag); + if (!flag) { + printf("Error: MPI has to be initialized when calling op_mpi_init with " + "communicators\n"); + exit(-1); + } + OP_MPI_WORLD = MPI_Comm_f2c(local); + OP_MPI_GLOBAL = MPI_Comm_f2c(global); + op_init_core(argc, argv, diags); + + cutilDeviceInit(argc, argv); + +// +// The following call is only made in the C version of OP2, +// as it causes memory trashing when called from Fortran. +// \warning add -DSET_CUDA_CACHE_CONFIG to compiling line +// for this file when implementing C OP2. 
+// + +#ifdef SET_CUDA_CACHE_CONFIG + cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); +#endif + + //cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); + cutilSafeCall(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte)); + printf("\n 16/48 L1/shared \n"); +} + +op_dat op_decl_dat_char(op_set set, int dim, char const *type, int size, + char *data, char const *name) { + if (set == NULL || data == NULL) + return NULL; + /*char *d = (char *)malloc((size_t)set->size * (size_t)dim * (size_t)size); + if (d == NULL && set->size>0) { + printf(" op_decl_dat_char error -- error allocating memory to dat\n"); + exit(-1); + } + + memcpy(d, data, set->size * dim * size * sizeof(char)); + op_dat out_dat = op_decl_dat_core(set, dim, type, size, d, name);*/ + op_dat out_dat = op_decl_dat_core(set, dim, type, size, data, name); + + op_dat_entry *item; + op_dat_entry *tmp_item; + for (item = TAILQ_FIRST(&OP_dat_list); item != NULL; item = tmp_item) { + tmp_item = TAILQ_NEXT(item, entries); + if (item->dat == out_dat) { + item->orig_ptr = data; + break; + } + } + out_dat->user_managed = 0; + return out_dat; +} + +op_dat op_decl_dat_temp_char(op_set set, int dim, char const *type, int size, + char const *name) { + char *data = NULL; + op_dat dat = op_decl_dat_temp_core(set, dim, type, size, data, name); + + // create empty data block to assign to this temporary dat (including the + // halos) + int set_size = set->size + OP_import_exec_list[set->index]->size + + OP_import_nonexec_list[set->index]->size; + + // initialize data bits to 0 + for (size_t i = 0; i < set_size * dim * size; i++) + dat->data[i] = 0; + dat->user_managed = 0; + + // transpose + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + cutilSafeCall( + hipMalloc((void **)&(dat->buffer_d_r), + (size_t)dat->size * (OP_import_exec_list[set->index]->size + + OP_import_nonexec_list[set->index]->size))); + } + + op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), + (size_t)dat->size * set_size); + + // need to allocate mpi_buffers for this new temp_dat + op_mpi_buffer mpi_buf = (op_mpi_buffer)xmalloc(sizeof(op_mpi_buffer_core)); + + halo_list exec_e_list = OP_export_exec_list[set->index]; + halo_list nonexec_e_list = OP_export_nonexec_list[set->index]; + + mpi_buf->buf_exec = (char *)xmalloc((exec_e_list->size) * (size_t)dat->size); + mpi_buf->buf_nonexec = (char *)xmalloc((nonexec_e_list->size) * (size_t)dat->size); + + halo_list exec_i_list = OP_import_exec_list[set->index]; + halo_list nonexec_i_list = OP_import_nonexec_list[set->index]; + + mpi_buf->s_req = (MPI_Request *)xmalloc( + sizeof(MPI_Request) * + (exec_e_list->ranks_size + nonexec_e_list->ranks_size)); + mpi_buf->r_req = (MPI_Request *)xmalloc( + sizeof(MPI_Request) * + (exec_i_list->ranks_size + nonexec_i_list->ranks_size)); + + mpi_buf->s_num_req = 0; + mpi_buf->r_num_req = 0; + + dat->mpi_buffer = mpi_buf; + + // need to allocate device buffers for mpi comms for this new temp_dat + cutilSafeCall( + hipMalloc((void **)&(dat->buffer_d), + (size_t)dat->size * (OP_export_exec_list[set->index]->size + + OP_export_nonexec_list[set->index]->size))); + + return dat; +} + +int op_free_dat_temp_char(op_dat dat) { + // need to free mpi_buffers use in this op_dat + free(((op_mpi_buffer)(dat->mpi_buffer))->buf_exec); + free(((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec); + free(((op_mpi_buffer)(dat->mpi_buffer))->s_req); + free(((op_mpi_buffer)(dat->mpi_buffer))->r_req); + free(dat->mpi_buffer); + + // need to free 
device buffers used in mpi comms + cutilSafeCall(hipFree(dat->buffer_d)); + + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + cutilSafeCall(hipFree(dat->buffer_d_r)); + } + + // free data on device + cutilSafeCall(hipFree(dat->data_d)); + return op_free_dat_temp_core(dat); +} + +void op_mv_halo_device(op_set set, op_dat dat) { + int set_size = set->size + OP_import_exec_list[set->index]->size + + OP_import_nonexec_list[set->index]->size; + if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + char *temp_data = (char *)malloc((size_t)dat->size * set_size * sizeof(char)); + size_t element_size = (size_t)dat->size / dat->dim; + for (size_t i = 0; i < dat->dim; i++) { + for (size_t j = 0; j < set_size; j++) { + for (size_t c = 0; c < element_size; c++) { + temp_data[element_size * i * set_size + element_size * j + c] = + dat->data[(size_t)dat->size * j + element_size * i + c]; + } + } + } + op_cpHostToDevice((void **)&(dat->data_d), (void **)&(temp_data), + (size_t)dat->size * set_size); + free(temp_data); + + if (dat->buffer_d_r != NULL) cutilSafeCall(hipFree(dat->buffer_d_r)); + cutilSafeCall( + hipMalloc((void **)&(dat->buffer_d_r), + (size_t)dat->size * (OP_import_exec_list[set->index]->size + + OP_import_nonexec_list[set->index]->size))); + + } else { + op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), + (size_t)dat->size * set_size); + } + dat->dirty_hd = 0; + if (dat->buffer_d != NULL) cutilSafeCall(hipFree(dat->buffer_d)); + cutilSafeCall( + hipMalloc((void **)&(dat->buffer_d), + (size_t)dat->size * (OP_export_exec_list[set->index]->size + + OP_export_nonexec_list[set->index]->size + + set_import_buffer_size[set->index]))); +} + +void op_mv_halo_list_device() { + if (export_exec_list_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (export_exec_list_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(export_exec_list_d[OP_set_list[s]->index])); + free(export_exec_list_d); + } + export_exec_list_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + export_exec_list_d[set->index] = NULL; + + op_cpHostToDevice((void **)&(export_exec_list_d[set->index]), + (void **)&(OP_export_exec_list[set->index]->list), + OP_export_exec_list[set->index]->size * sizeof(int)); + } + + if (export_nonexec_list_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (export_nonexec_list_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(export_nonexec_list_d[OP_set_list[s]->index])); + free(export_nonexec_list_d); + } + export_nonexec_list_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + export_nonexec_list_d[set->index] = NULL; + + op_cpHostToDevice((void **)&(export_nonexec_list_d[set->index]), + (void **)&(OP_export_nonexec_list[set->index]->list), + OP_export_nonexec_list[set->index]->size * sizeof(int)); + } + + //for grouped, we need the disps array on device too + if (export_exec_list_disps_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (export_exec_list_disps_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(export_exec_list_disps_d[OP_set_list[s]->index])); + free(export_exec_list_disps_d); + } + export_exec_list_disps_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + export_exec_list_disps_d[set->index] = NULL; 
+ + //make sure end size is there too + OP_export_exec_list[set->index] + ->disps[OP_export_exec_list[set->index]->ranks_size] = + OP_export_exec_list[set->index]->ranks_size == 0 + ? 0 + : OP_export_exec_list[set->index] + ->disps[OP_export_exec_list[set->index]->ranks_size - 1] + + OP_export_exec_list[set->index] + ->sizes[OP_export_exec_list[set->index]->ranks_size - 1]; + op_cpHostToDevice((void **)&(export_exec_list_disps_d[set->index]), + (void **)&(OP_export_exec_list[set->index]->disps), + (OP_export_exec_list[set->index]->ranks_size+1) * sizeof(int)); + } + + if (export_nonexec_list_disps_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (export_nonexec_list_disps_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(export_nonexec_list_disps_d[OP_set_list[s]->index])); + free(export_nonexec_list_disps_d); + } + export_nonexec_list_disps_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + export_nonexec_list_disps_d[set->index] = NULL; + + //make sure end size is there too + OP_export_nonexec_list[set->index] + ->disps[OP_export_nonexec_list[set->index]->ranks_size] = + OP_export_nonexec_list[set->index]->ranks_size == 0 + ? 0 + : OP_export_nonexec_list[set->index] + ->disps[OP_export_nonexec_list[set->index]->ranks_size - + 1] + + OP_export_nonexec_list[set->index] + ->sizes[OP_export_nonexec_list[set->index]->ranks_size - + 1]; + op_cpHostToDevice((void **)&(export_nonexec_list_disps_d[set->index]), + (void **)&(OP_export_nonexec_list[set->index]->disps), + (OP_export_nonexec_list[set->index]->ranks_size+1) * sizeof(int)); + } + if (import_exec_list_disps_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (import_exec_list_disps_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(import_exec_list_disps_d[OP_set_list[s]->index])); + free(import_exec_list_disps_d); + } + import_exec_list_disps_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + import_exec_list_disps_d[set->index] = NULL; + + //make sure end size is there too + OP_import_exec_list[set->index] + ->disps[OP_import_exec_list[set->index]->ranks_size] = + OP_import_exec_list[set->index]->ranks_size == 0 + ? 0 + : OP_import_exec_list[set->index] + ->disps[OP_import_exec_list[set->index]->ranks_size - 1] + + OP_import_exec_list[set->index] + ->sizes[OP_import_exec_list[set->index]->ranks_size - 1]; + op_cpHostToDevice((void **)&(import_exec_list_disps_d[set->index]), + (void **)&(OP_import_exec_list[set->index]->disps), + (OP_import_exec_list[set->index]->ranks_size+1) * sizeof(int)); + } + + if (import_nonexec_list_disps_d != NULL) { + for (int s = 0; s < OP_set_index; s++) + if (import_nonexec_list_disps_d[OP_set_list[s]->index] != NULL) + cutilSafeCall(hipFree(import_nonexec_list_disps_d[OP_set_list[s]->index])); + free(import_nonexec_list_disps_d); + } + import_nonexec_list_disps_d = (int **)xmalloc(sizeof(int *) * OP_set_index); + + for (int s = 0; s < OP_set_index; s++) { // for each set + op_set set = OP_set_list[s]; + import_nonexec_list_disps_d[set->index] = NULL; + + //make sure end size is there too + OP_import_nonexec_list[set->index] + ->disps[OP_import_nonexec_list[set->index]->ranks_size] = + OP_import_nonexec_list[set->index]->ranks_size == 0 + ? 
0
+              : OP_import_nonexec_list[set->index]
+                    ->disps[OP_import_nonexec_list[set->index]->ranks_size -
+                            1] +
+                OP_import_nonexec_list[set->index]
+                    ->sizes[OP_import_nonexec_list[set->index]->ranks_size -
+                            1];
+    op_cpHostToDevice((void **)&(import_nonexec_list_disps_d[set->index]),
+                      (void **)&(OP_import_nonexec_list[set->index]->disps),
+                      (OP_import_nonexec_list[set->index]->ranks_size+1) * sizeof(int));
+  }
+
+  if (export_nonexec_list_partial_d != NULL) {
+    for (int s = 0; s < OP_map_index; s++)
+      if (OP_map_partial_exchange[s] && export_nonexec_list_partial_d[OP_map_list[s]->index] != NULL)
+        cutilSafeCall(hipFree(export_nonexec_list_partial_d[OP_map_list[s]->index]));
+    free(export_nonexec_list_partial_d);
+  }
+  export_nonexec_list_partial_d = (int **)calloc(OP_map_index, sizeof(int *));
+
+  for (int s = 0; s < OP_map_index; s++) { // for each map
+    if (!OP_map_partial_exchange[s])
+      continue;
+    op_map map = OP_map_list[s];
+    export_nonexec_list_partial_d[map->index] = NULL;
+
+    op_cpHostToDevice((void **)&(export_nonexec_list_partial_d[map->index]),
+                      (void **)&(OP_export_nonexec_permap[map->index]->list),
+                      OP_export_nonexec_permap[map->index]->size * sizeof(int));
+  }
+
+  if (import_nonexec_list_partial_d != NULL) {
+    for (int s = 0; s < OP_map_index; s++)
+      if (OP_map_partial_exchange[s] && import_nonexec_list_partial_d[OP_map_list[s]->index] != NULL)
+        cutilSafeCall(hipFree(import_nonexec_list_partial_d[OP_map_list[s]->index]));
+    free(import_nonexec_list_partial_d);
+  }
+  import_nonexec_list_partial_d = (int **)calloc(OP_map_index, sizeof(int *));
+
+  for (int s = 0; s < OP_map_index; s++) { // for each map
+    if (!OP_map_partial_exchange[s])
+      continue;
+    op_map map = OP_map_list[s];
+    import_nonexec_list_partial_d[map->index] = NULL;
+
+    op_cpHostToDevice((void **)&(import_nonexec_list_partial_d[map->index]),
+                      (void **)&(OP_import_nonexec_permap[map->index]->list),
+                      OP_import_nonexec_permap[map->index]->size * sizeof(int));
+  }
+}
+
+op_set op_decl_set(int size, char const *name) {
+  return op_decl_set_core(size, name);
+}
+
+op_map op_decl_map(op_set from, op_set to, int dim, int *imap,
+                   char const *name) {
+  // int *m = (int *)xmalloc(from->size * dim * sizeof(int));
+  // memcpy(m, imap, from->size * dim * sizeof(int));
+  op_map out_map = op_decl_map_core(from, to, dim, imap, name);
+  out_map->user_managed = 0;
+  return out_map;
+  // return op_decl_map_core ( from, to, dim, imap, name );
+}
+
+op_arg op_arg_dat(op_dat dat, int idx, op_map map, int dim, char const *type,
+                  op_access acc) {
+  return op_arg_dat_core(dat, idx, map, dim, type, acc);
+}
+
+op_arg op_opt_arg_dat(int opt, op_dat dat, int idx, op_map map, int dim,
+                      char const *type, op_access acc) {
+  return op_opt_arg_dat_core(opt, dat, idx, map, dim, type, acc);
+}
+
+op_arg op_arg_gbl_char(char *data, int dim, const char *type, int size,
+                       op_access acc) {
+  return op_arg_gbl_core(1, data, dim, type, size, acc);
+}
+
+op_arg op_opt_arg_gbl_char(int opt, char *data, int dim, const char *type,
+                           int size, op_access acc) {
+  return op_arg_gbl_core(opt, data, dim, type, size, acc);
+}
+
+void op_printf(const char *format, ...) {
+  int my_rank;
+  MPI_Comm_rank(OP_MPI_WORLD, &my_rank);
+  if (my_rank == MPI_ROOT) {
+    va_list argptr;
+    va_start(argptr, format);
+    vprintf(format, argptr);
+    va_end(argptr);
+  }
+}
+
+void op_print(const char *line) {
+  int my_rank;
+  MPI_Comm_rank(OP_MPI_WORLD, &my_rank);
+  if (my_rank == MPI_ROOT) {
+    printf("%s\n", line);
+  }
+}
+
+void op_timers(double *cpu, double *et) {
+  MPI_Barrier(OP_MPI_WORLD);
+  op_timers_core(cpu, et);
+}
+
+//
+// This function is defined in the generated master kernel file
+// so that it is possible to check on the runtime size of the
+// data in cases where it is not known at compile time
+//
+
+/*
+void
+op_decl_const_char ( int dim, char const * type, int size, char * dat,
+                     char const * name )
+{
+  cutilSafeCall ( hipMemcpyToSymbol(HIP_SYMBOL(name), dat, dim * size, 0,
+                                    hipMemcpyHostToDevice ) );
+}
+*/
+
+void op_exit() {
+  // need to free buffer_d used for mpi comms in each op_dat
+  if (OP_hybrid_gpu) {
+    op_dat_entry *item;
+    TAILQ_FOREACH(item, &OP_dat_list, entries) {
+      if (strstr(item->dat->type, ":soa") != NULL ||
+          (OP_auto_soa && item->dat->dim > 1)) {
+        cutilSafeCall(hipFree((item->dat)->buffer_d_r));
+      }
+      cutilSafeCall(hipFree((item->dat)->buffer_d));
+    }
+
+    for (int i = 0; i < OP_set_index; i++) {
+      if (export_exec_list_d[i] != NULL)
+        cutilSafeCall(hipFree(export_exec_list_d[i]));
+      if (export_nonexec_list_d[i] != NULL)
+        cutilSafeCall(hipFree(export_nonexec_list_d[i]));
+    }
+    for (int i = 0; i < OP_map_index; i++) {
+      if (!OP_map_partial_exchange[i])
+        continue;
+      cutilSafeCall(hipFree(export_nonexec_list_partial_d[i]));
+      cutilSafeCall(hipFree(import_nonexec_list_partial_d[i]));
+    }
+  }
+
+  op_mpi_exit();
+  op_cuda_exit(); // frees dat_d memory
+  op_rt_exit();   // frees plan memory
+  op_exit_core(); // frees lib core variables
+
+  int flag = 0;
+  MPI_Finalized(&flag);
+  if (!flag)
+    MPI_Finalize();
+}
+
+void op_timing_output() {
+  op_timing_output_core();
+  printf("Total plan time: %8.4f\n", OP_plan_time);
+}
+
+void op_timings_to_csv(const char *outputFileName) {
+  int comm_size, comm_rank;
+  MPI_Comm_size(OP_MPI_WORLD, &comm_size);
+  MPI_Comm_rank(OP_MPI_WORLD, &comm_rank);
+
+  FILE * outputFile = NULL;
+  if (op_is_root()) {
+    outputFile = fopen(outputFileName, "w");
+    if (outputFile == NULL) {
+      printf("ERROR: Failed to open file for writing: '%s'\n", outputFileName);
+    }
+    else {
+      fprintf(outputFile, "rank,thread,nranks,nthreads,count,total time,plan time,mpi time,GB used,GB total,kernel name\n");
+    }
+  }
+
+  int can_write = (outputFile != NULL);
+  MPI_Bcast(&can_write, 1, MPI_INT, MPI_ROOT, OP_MPI_WORLD);
+
+  if (can_write) {
+    for (int n = 0; n < OP_kern_max; n++) {
+      if (OP_kernels[n].count > 0) {
+        if (OP_kernels[n].ntimes == 1 && OP_kernels[n].times[0] == 0.0f &&
+            OP_kernels[n].time != 0.0f) {
+          // This library is being used by an OP2 translation made with the
+          // older translator with older timing logic. Adjust to new logic:
+          OP_kernels[n].times[0] = OP_kernels[n].time;
+        }
+
+        if (op_is_root()) {
+          double times[OP_kernels[n].ntimes*comm_size];
+          for (int i=0; i<(OP_kernels[n].ntimes*comm_size); i++) times[i] = 0.0f;
+          MPI_Gather(OP_kernels[n].times, OP_kernels[n].ntimes, MPI_DOUBLE, times, OP_kernels[n].ntimes, MPI_DOUBLE, MPI_ROOT, OP_MPI_WORLD);
+
+          float plan_times[comm_size];
+          for (int i=0; idata);
+  free(temp->set);
+  free(temp);
+}
+
+void op_print_dat_to_txtfile(op_dat dat, const char *file_name) {
+  // need to get data from GPU
+  op_cuda_get_data(dat);
+
+  // rearrange data back to original order in mpi
+  op_dat temp = op_mpi_get_data(dat);
+  print_dat_to_txtfile_mpi(temp, file_name);
+
+  free(temp->data);
+  free(temp->set);
+  free(temp);
+}
+
+void op_upload_all() {
+  op_dat_entry *item;
+  TAILQ_FOREACH(item, &OP_dat_list, entries) {
+    op_dat dat = item->dat;
+    int set_size = dat->set->size + OP_import_exec_list[dat->set->index]->size +
+                   OP_import_nonexec_list[dat->set->index]->size;
+    if (dat->data_d) {
+      if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+        char *temp_data = (char *)malloc((size_t)dat->size * set_size * sizeof(char));
+        size_t element_size = (size_t)dat->size / dat->dim;
+        for (size_t i = 0; i < dat->dim; i++) {
+          for (size_t j = 0; j < set_size; j++) {
+            for (size_t c = 0; c < element_size; c++) {
+              temp_data[element_size * i * set_size + element_size * j + c] =
+                  dat->data[(size_t)dat->size * j + element_size * i + c];
+            }
+          }
+        }
+        cutilSafeCall(hipMemcpy(dat->data_d, temp_data, (size_t)dat->size * set_size,
+                                hipMemcpyHostToDevice));
+        dat->dirty_hd = 0;
+        free(temp_data);
+      } else {
+        cutilSafeCall(hipMemcpy(dat->data_d, dat->data, (size_t)dat->size * set_size,
+                                hipMemcpyHostToDevice));
+        dat->dirty_hd = 0;
+      }
+    }
+  }
+}
+
+void op_fetch_data_char(op_dat dat, char *usr_ptr) {
+  // need to get data from GPU
+  op_cuda_get_data(dat);
+
+  // rearrange data back to original order in mpi
+  op_dat temp = op_mpi_get_data(dat);
+
+  // copy data into usr_ptr
+  memcpy((void *)usr_ptr, (void *)temp->data, temp->set->size * temp->size);
+  free(temp->data);
+  free(temp->set);
+  free(temp);
+}
+
+op_dat op_fetch_data_file_char(op_dat dat) {
+  // need to get data from GPU
+  op_cuda_get_data(dat);
+  // rearrange data back to original order in mpi
+  return op_mpi_get_data(dat);
+}
+
+void op_fetch_data_idx_char(op_dat dat, char *usr_ptr, int low, int high) {
+  // need to get data from GPU
+  op_cuda_get_data(dat);
+
+  // rearrange data back to original order in mpi
+  op_dat temp = op_mpi_get_data(dat);
+
+  // do allgather on temp->data and copy it to memory block pointed to by
+  // usr_ptr
+  fetch_data_hdf5(temp, usr_ptr, low, high);
+
+  free(temp->data);
+  free(temp->set);
+  free(temp);
+}
diff --git a/op2/src/mpi/op_mpi_hip_kernels.cpp b/op2/src/mpi/op_mpi_hip_kernels.cpp
new file mode 100644
index 000000000..e9c7cd661
--- /dev/null
+++ b/op2/src/mpi/op_mpi_hip_kernels.cpp
@@ -0,0 +1,453 @@
+#include "hip/hip_runtime.h"
+/*
+ * Open source copyright declaration based on BSD open source template:
+ * http://www.opensource.org/licenses/bsd-license.php
+ *
+ * This file is part of the OP2 distribution.
+ *
+ * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in
+ * the main source directory for a full list of copyright holders.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * The name of Mike Giles may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define OP_MPI_CORE_NOMPI +#include + +#include +#include +#include +#include + +__global__ void export_halo_gather(int *list, char *dat, int copy_size, + int elem_size, char *export_buffer) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < copy_size) { + int off = 0; + if (elem_size % 16 == 0) { + off += 16 * (elem_size / 16); + for (int i = 0; i < elem_size / 16; i++) { + ((double2 *)(export_buffer + id * elem_size))[i] = + ((double2 *)(dat + list[id] * elem_size))[i]; + } + } else if (elem_size % 8 == 0) { + off += 8 * (elem_size / 8); + for (int i = 0; i < elem_size / 8; i++) { + ((double *)(export_buffer + id * elem_size))[i] = + ((double *)(dat + list[id] * elem_size))[i]; + } + } + for (int i = off; i < elem_size; i++) { + export_buffer[id * elem_size + i] = dat[list[id] * elem_size + i]; + } + } +} + +__global__ void export_halo_gather_soa(int *list, char *dat, int copy_size, + int elem_size, char *export_buffer, + int set_size, int dim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size_of = elem_size / dim; + if (id < copy_size) { + if (size_of == 8) { + for (int i = 0; i < dim; i++) { + ((double *)(export_buffer + id * elem_size))[i] = + ((double *)(dat + list[id] * size_of))[i * set_size]; + } + } else { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < size_of; j++) { + export_buffer[id * elem_size + i * size_of + j] = + dat[list[id] * size_of + i * set_size * size_of + j]; + } + } + } + } +} + +__global__ void import_halo_scatter_soa(int offset, char *dat, int copy_size, + int elem_size, char *import_buffer, + int set_size, int dim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size_of = elem_size / dim; + if (id < copy_size) { + if (size_of == 8) { + for (int i = 0; i < dim; i++) { + ((double *)(dat + (offset + id) * size_of))[i * set_size] = + ((double *)(import_buffer + id * elem_size))[i]; + } + } else { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < size_of; j++) { + dat[(offset + id) * size_of + i * set_size * size_of + j] = + import_buffer[id * elem_size + i * size_of + j]; + } + } + } + } +} + +__global__ void 
import_halo_scatter_partial_soa(int *list, char *dat, + int copy_size, int elem_size, + char *import_buffer, + int set_size, int dim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size_of = elem_size / dim; + if (id < copy_size) { + int element = list[id]; + if (size_of == 8) { + for (int i = 0; i < dim; i++) { + ((double *)(dat + (element)*size_of))[i * set_size] = + ((double *)(import_buffer + id * elem_size))[i]; + } + } else { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < size_of; j++) { + dat[(element)*size_of + i * set_size * size_of + j] = + import_buffer[id * elem_size + i * size_of + j]; + } + } + } + } +} + +__global__ void import_halo_scatter_partial(int *list, char *dat, int copy_size, + int elem_size, char *import_buffer, + int dim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size_of = elem_size / dim; + if (id < copy_size) { + int element = list[id]; + if (size_of == 8) { + for (int i = 0; i < dim; i++) { + ((double *)(dat + element * elem_size))[i] = + ((double *)(import_buffer + id * elem_size))[i]; + } + } else { + for (int i = 0; i < dim; i++) { + for (int j = 0; j < size_of; j++) { + dat[element * elem_size + i * size_of + j] = + import_buffer[id * elem_size + i * size_of + j]; + } + } + } + } +} + +void gather_data_to_buffer(op_arg arg, halo_list exp_exec_list, + halo_list exp_nonexec_list) { + int threads = 192; + int blocks = 1 + ((exp_exec_list->size - 1) / 192); + + if (strstr(arg.dat->type, ":soa") != NULL || + (OP_auto_soa && arg.dat->dim > 1)) { + + int set_size = arg.dat->set->size + arg.dat->set->exec_size + + arg.dat->set->nonexec_size; + + hipLaunchKernelGGL(export_halo_gather_soa, blocks, threads, 0, 0, + export_exec_list_d[arg.dat->set->index], arg.data_d, + exp_exec_list->size, arg.dat->size, arg.dat->buffer_d, set_size, + arg.dat->dim); + + int blocks2 = 1 + ((exp_nonexec_list->size - 1) / 192); + hipLaunchKernelGGL(export_halo_gather_soa, blocks2, threads, 0, 0, + export_nonexec_list_d[arg.dat->set->index], arg.data_d, + exp_nonexec_list->size, arg.dat->size, + arg.dat->buffer_d + exp_exec_list->size * arg.dat->size, set_size, + arg.dat->dim); + } else { + hipLaunchKernelGGL(export_halo_gather, blocks, threads, 0, 0, + export_exec_list_d[arg.dat->set->index], arg.data_d, + exp_exec_list->size, arg.dat->size, arg.dat->buffer_d); + + int blocks2 = 1 + ((exp_nonexec_list->size - 1) / 192); + hipLaunchKernelGGL(export_halo_gather, blocks2, threads, 0, 0, + export_nonexec_list_d[arg.dat->set->index], arg.data_d, + exp_nonexec_list->size, arg.dat->size, + arg.dat->buffer_d + exp_exec_list->size * arg.dat->size); + } +} + +void gather_data_to_buffer_partial(op_arg arg, halo_list exp_nonexec_list) { + int threads = 192; + int blocks = 1 + ((exp_nonexec_list->size - 1) / 192); + + if (strstr(arg.dat->type, ":soa") != NULL || + (OP_auto_soa && arg.dat->dim > 1)) { + + int set_size = arg.dat->set->size + arg.dat->set->exec_size + + arg.dat->set->nonexec_size; + + hipLaunchKernelGGL(export_halo_gather_soa, blocks, threads, 0, 0, + export_nonexec_list_partial_d[arg.map->index], arg.data_d, + exp_nonexec_list->size, arg.dat->size, arg.dat->buffer_d, set_size, + arg.dat->dim); + } else { + hipLaunchKernelGGL(export_halo_gather, blocks, threads, 0, 0, + export_nonexec_list_partial_d[arg.map->index], arg.data_d, + exp_nonexec_list->size, arg.dat->size, arg.dat->buffer_d); + } +} + +void scatter_data_from_buffer(op_arg arg) { + int threads = 192; + int blocks = 1 + ((arg.dat->set->exec_size - 1) / 192); + + if (strstr(arg.dat->type, 
":soa") != NULL || + (OP_auto_soa && arg.dat->dim > 1)) { + + int set_size = arg.dat->set->size + arg.dat->set->exec_size + + arg.dat->set->nonexec_size; + int offset = arg.dat->set->size; + int copy_size = arg.dat->set->exec_size; + + hipLaunchKernelGGL(import_halo_scatter_soa, blocks, threads, 0, 0, + offset, arg.data_d, copy_size, arg.dat->size, arg.dat->buffer_d_r, + set_size, arg.dat->dim); + + offset += arg.dat->set->exec_size; + copy_size = arg.dat->set->nonexec_size; + + int blocks2 = 1 + ((arg.dat->set->nonexec_size - 1) / 192); + hipLaunchKernelGGL(import_halo_scatter_soa, blocks2, threads, 0, 0, + offset, arg.data_d, copy_size, arg.dat->size, + arg.dat->buffer_d_r + arg.dat->set->exec_size * arg.dat->size, set_size, + arg.dat->dim); + } +} + +void scatter_data_from_buffer_partial(op_arg arg) { + int threads = 192; + int blocks = 1 + ((OP_import_nonexec_permap[arg.map->index]->size - 1) / 192); + + if (strstr(arg.dat->type, ":soa") != NULL || + (OP_auto_soa && arg.dat->dim > 1)) { + + int set_size = arg.dat->set->size + arg.dat->set->exec_size + + arg.dat->set->nonexec_size; + int init = OP_export_nonexec_permap[arg.map->index]->size; + int copy_size = OP_import_nonexec_permap[arg.map->index]->size; + + hipLaunchKernelGGL(import_halo_scatter_partial_soa, blocks, threads, 0, 0, + import_nonexec_list_partial_d[arg.map->index], arg.data_d, copy_size, + arg.dat->size, arg.dat->buffer_d + init * arg.dat->size, set_size, + arg.dat->dim); + } else { + int init = OP_export_nonexec_permap[arg.map->index]->size; + int copy_size = OP_import_nonexec_permap[arg.map->index]->size; + + hipLaunchKernelGGL(import_halo_scatter_partial, blocks, threads, 0, 0, + import_nonexec_list_partial_d[arg.map->index], arg.data_d, copy_size, + arg.dat->size, arg.dat->buffer_d + init * arg.dat->size, arg.dat->dim); + } +} + +__device__ int lower_bound(int *disps, int count, int value) { + int *it; + int *first = disps; + int step; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (*it < value) { + first = ++it; + count -= step + 1; + } + else + count = step; + } + return first-disps; +} + +__global__ void gather_data_to_buffer_ptr_cuda_kernel(const char *__restrict data, char *__restrict buffer, int *elem_list, int *disps, + unsigned *neigh_to_neigh_offsets, int rank_size, int soa, int type_size, int dim, int set_size) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id >= disps[rank_size]) return; + int neighbour = lower_bound(disps, rank_size, id); + if (disps[neighbour]!=id) neighbour--; + unsigned buf_pos = neigh_to_neigh_offsets[neighbour]; + unsigned set_elem_index = elem_list[id]; + if (soa) { + for (int d = 0; d < dim; d++) + if (type_size == 8 && (buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size)%8==0) + *(double*)&buffer[buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size] = *(double*)&data[(d*set_size + set_elem_index)*type_size]; + else + for (int p = 0; p < type_size; p++) + buffer[buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size + p] = data[(d*set_size + set_elem_index)*type_size + p]; + + } else { + int dat_size = type_size * dim; + if (type_size == 8 && (buf_pos + (id - disps[neighbour]) * dat_size)%8==0) + for (int d = 0; d < dim; d++) + *(double*)&buffer[buf_pos + (id - disps[neighbour]) * dat_size + d*type_size] = *(double*)&data[set_elem_index*dat_size + d*type_size]; + else + for (int p = 0; p < dat_size; p++) + buffer[buf_pos + (id - disps[neighbour]) * dat_size + p] = data[set_elem_index*dat_size + p]; 
+  }
+}
+
+__global__ void scatter_data_from_buffer_ptr_cuda_kernel(char * __restrict data, const char * __restrict buffer, int *disps,
+                unsigned *neigh_to_neigh_offsets, int rank_size, int soa, int type_size, int dim, int set_size) {
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (id >= disps[rank_size]) return;
+  int neighbour = lower_bound(disps, rank_size, id);
+  if (disps[neighbour]!=id) neighbour--;
+  unsigned buf_pos = neigh_to_neigh_offsets[neighbour];
+  if (soa) {
+    for (int d = 0; d < dim; d++)
+      if (type_size == 8 && (buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size)%8==0)
+        *(double*)&data[(d*set_size + id)*type_size] = *(double*)&buffer[buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size];
+      else
+        for (int p = 0; p < type_size; p++)
+          data[(d*set_size + id)*type_size + p] = buffer[buf_pos + (id - disps[neighbour]) * type_size * dim + d * type_size + p];
+  } else {
+    int dat_size = type_size * dim;
+    // if (*(double*)&buffer[buf_pos + (id - disps[neighbour]) * dat_size] != *(double*)&data[id*dat_size])
+    //   printf("Mismatch\n");
+    if (type_size == 8 && (buf_pos + (id - disps[neighbour]) * dat_size)%8==0)
+      for (int d = 0; d < dim; d++)
+        *(double*)&data[id*dat_size + d*type_size] = *(double*)&buffer[buf_pos + (id - disps[neighbour]) * dat_size + d*type_size];
+    else
+      for (int p = 0; p < dat_size; p++)
+        data[id*dat_size + p] = buffer[buf_pos + (id - disps[neighbour]) * dat_size + p];
+  }
+}
+
+
+unsigned *op2_grp_neigh_to_neigh_offsets_h = NULL;
+unsigned *op2_grp_neigh_to_neigh_offsets_d = NULL;
+int op2_grp_max_gathers = 10;
+extern int op2_grp_counter;
+int op2_grp_max_neighbours = 0;
+
+void check_realloc_buffer() {
+  //Figure out how much space we may need at most
+  if (op2_grp_neigh_to_neigh_offsets_h == NULL) {
+    for (int i = 0; i < OP_set_index; i++) {
+      op2_grp_max_neighbours = MAX(op2_grp_max_neighbours,OP_export_exec_list[i]->ranks_size);
+      op2_grp_max_neighbours = MAX(op2_grp_max_neighbours,OP_export_nonexec_list[i]->ranks_size);
+      op2_grp_max_neighbours = MAX(op2_grp_max_neighbours,OP_import_exec_list[i]->ranks_size);
+      op2_grp_max_neighbours = MAX(op2_grp_max_neighbours,OP_import_nonexec_list[i]->ranks_size);
+    }
+    //Need host buffers for each dat in flight
+    cutilSafeCall(hipHostMalloc(&op2_grp_neigh_to_neigh_offsets_h, op2_grp_max_gathers * op2_grp_max_neighbours * sizeof(unsigned)));
+    //But just one device buffer if gather kernels are sequential
+    cutilSafeCall(hipMalloc(&op2_grp_neigh_to_neigh_offsets_d, op2_grp_max_neighbours * sizeof(unsigned)));
+  }
+  if (op2_grp_counter >= op2_grp_max_gathers) {
+    cutilSafeCall(hipDeviceSynchronize());
+    cutilSafeCall(hipHostFree(op2_grp_neigh_to_neigh_offsets_h));
+    op2_grp_max_gathers *= 2;
+    cutilSafeCall(hipHostMalloc(&op2_grp_neigh_to_neigh_offsets_h, op2_grp_max_gathers * op2_grp_max_neighbours * sizeof(unsigned)));
+  }
+}
+
+void gather_data_to_buffer_ptr_cuda(op_arg arg, halo_list eel, halo_list enl, char *buffer,
+                                    std::vector<int>& neigh_list, std::vector<unsigned>& neigh_offsets) {
+
+  check_realloc_buffer();
+
+  int soa = 0;
+  if ((OP_auto_soa && arg.dat->dim > 1) || strstr(arg.dat->type, ":soa") != NULL) soa = 1;
+
+  //Exec halo
+
+  //Fill op2_grp_neigh_to_neigh_offsets_h at the appropriate position
+  for (int i = 0; i < eel->ranks_size; i++) {
+    int dest_rank = eel->ranks[i];
+    int buf_rankpos = std::distance(neigh_list.begin(),std::lower_bound(neigh_list.begin(), neigh_list.end(), dest_rank));
+    op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours+i] = neigh_offsets[buf_rankpos];
+    neigh_offsets[buf_rankpos] += eel->sizes[i] * arg.dat->size;
+  }
+  //Async upload
+  hipMemcpyAsync(op2_grp_neigh_to_neigh_offsets_d,&op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours],eel->ranks_size * sizeof(unsigned),hipMemcpyHostToDevice);
+  //Launch kernel
+  gather_data_to_buffer_ptr_cuda_kernel<<<1 + ((eel->size - 1) / 192),192>>>(arg.dat->data_d, buffer, export_exec_list_d[arg.dat->set->index], export_exec_list_disps_d[arg.dat->set->index],
+    op2_grp_neigh_to_neigh_offsets_d, eel->ranks_size, soa, arg.dat->size/arg.dat->dim, arg.dat->dim, arg.dat->set->size+arg.dat->set->exec_size+arg.dat->set->nonexec_size);
+  op2_grp_counter++;
+
+  //Same for nonexec
+
+  //Fill op2_grp_neigh_to_neigh_offsets_h at the appropriate position
+  for (int i = 0; i < enl->ranks_size; i++) {
+    int dest_rank = enl->ranks[i];
+    int buf_rankpos = std::distance(neigh_list.begin(),std::lower_bound(neigh_list.begin(), neigh_list.end(), dest_rank));
+    op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours+i] = neigh_offsets[buf_rankpos];
+    neigh_offsets[buf_rankpos] += enl->sizes[i] * arg.dat->size;
+  }
+  //Async upload
+  hipMemcpyAsync(op2_grp_neigh_to_neigh_offsets_d,&op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours],enl->ranks_size * sizeof(unsigned),hipMemcpyHostToDevice);
+  //Launch kernel
+  gather_data_to_buffer_ptr_cuda_kernel<<<1 + ((enl->size - 1) / 192),192>>>(arg.dat->data_d, buffer, export_nonexec_list_d[arg.dat->set->index], export_nonexec_list_disps_d[arg.dat->set->index],
+    op2_grp_neigh_to_neigh_offsets_d, enl->ranks_size, soa, arg.dat->size/arg.dat->dim, arg.dat->dim, arg.dat->set->size+arg.dat->set->exec_size+arg.dat->set->nonexec_size);
+
+  op2_grp_counter++;
+
+}
+
+void scatter_data_from_buffer_ptr_cuda(op_arg arg, halo_list iel, halo_list inl, char *buffer,
+                                       std::vector<int>& neigh_list, std::vector<unsigned>& neigh_offsets) {
+
+  check_realloc_buffer();
+
+  int soa = 0;
+  if ((OP_auto_soa && arg.dat->dim > 1) || strstr(arg.dat->type, ":soa") != NULL) soa = 1;
+
+  //Exec halo
+
+  //Fill op2_grp_neigh_to_neigh_offsets_h at the appropriate position
+  for (int i = 0; i < iel->ranks_size; i++) {
+    int dest_rank = iel->ranks[i];
+    int buf_rankpos = std::distance(neigh_list.begin(),std::lower_bound(neigh_list.begin(), neigh_list.end(), dest_rank));
+    op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours+i] = neigh_offsets[buf_rankpos];
+    neigh_offsets[buf_rankpos] += iel->sizes[i] * arg.dat->size;
+  }
+  //Async upload
+  hipMemcpyAsync(op2_grp_neigh_to_neigh_offsets_d,&op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours],iel->ranks_size * sizeof(unsigned),hipMemcpyHostToDevice,op2_grp_secondary);
+  //Launch kernel
+  unsigned offset = arg.dat->set->size * (soa?arg.dat->size/arg.dat->dim:arg.dat->size);
+  scatter_data_from_buffer_ptr_cuda_kernel<<<1 + ((iel->size - 1) / 192),192,0,op2_grp_secondary>>>(arg.dat->data_d+offset, buffer, import_exec_list_disps_d[arg.dat->set->index],
+    op2_grp_neigh_to_neigh_offsets_d, iel->ranks_size, soa, arg.dat->size/arg.dat->dim, arg.dat->dim, arg.dat->set->size+arg.dat->set->exec_size+arg.dat->set->nonexec_size);
+  op2_grp_counter++;
+
+  //Same for nonexec
+
+  //Fill op2_grp_neigh_to_neigh_offsets_h at the appropriate position
+  for (int i = 0; i < inl->ranks_size; i++) {
+    int dest_rank = inl->ranks[i];
+    int buf_rankpos = std::distance(neigh_list.begin(),std::lower_bound(neigh_list.begin(), neigh_list.end(), dest_rank));
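+    // buf_rankpos: position of dest_rank in the sorted neighbour list, i.e.
+    // which per-rank segment of the grouped buffer this scatter reads from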
op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours+i] = neigh_offsets[buf_rankpos]; + neigh_offsets[buf_rankpos] += inl->sizes[i] * arg.dat->size; + } + //Async upload + hipMemcpyAsync(op2_grp_neigh_to_neigh_offsets_d,&op2_grp_neigh_to_neigh_offsets_h[op2_grp_counter*op2_grp_max_neighbours],inl->ranks_size * sizeof(unsigned),hipMemcpyHostToDevice,op2_grp_secondary); + //Launch kernel + offset = (arg.dat->set->size + iel->size) * (soa?arg.dat->size/arg.dat->dim:arg.dat->size); + scatter_data_from_buffer_ptr_cuda_kernel<<<1 + ((inl->size - 1) / 192),192,0,op2_grp_secondary>>>(arg.dat->data_d+offset, buffer, import_nonexec_list_disps_d[arg.dat->set->index], + op2_grp_neigh_to_neigh_offsets_d, inl->ranks_size, soa, arg.dat->size/arg.dat->dim, arg.dat->dim, arg.dat->set->size+arg.dat->set->exec_size+arg.dat->set->nonexec_size); + + op2_grp_counter++; + +} diff --git a/op2/src/mpi/op_mpi_hip_rt_support.cpp b/op2/src/mpi/op_mpi_hip_rt_support.cpp new file mode 100644 index 000000000..655cb25c2 --- /dev/null +++ b/op2/src/mpi/op_mpi_hip_rt_support.cpp @@ -0,0 +1,763 @@ +/* + * Open source copyright declaration based on BSD open source template: + * http://www.opensource.org/licenses/bsd-license.php + * + * This file is part of the OP2 distribution. + * + * Copyright (c) 2011, Mike Giles and others. Please see the AUTHORS file in + * the main source directory for a full list of copyright holders. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * The name of Mike Giles may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY Mike Giles ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Mike Giles BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// +// This file implements the MPI+HIP-specific run-time support functions +// + +// +// header files +// + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +// Small re-declaration to avoid using struct in the C version. 
+
+// This is due to the different way in which C and C++ see structs
+
+typedef struct hipDeviceProp_t cudaDeviceProp_t;
+
+//
+// export lists on the device
+//
+
+int **export_exec_list_d = NULL;
+int **export_nonexec_list_d = NULL;
+int **export_exec_list_disps_d = NULL;
+int **export_nonexec_list_disps_d = NULL;
+int **export_nonexec_list_partial_d = NULL;
+int **import_nonexec_list_partial_d = NULL;
+int **import_exec_list_disps_d = NULL;
+int **import_nonexec_list_disps_d = NULL;
+
+hipEvent_t op2_grp_download_event;
+hipStream_t op2_grp_secondary;
+
+void cutilDeviceInit(int argc, char **argv) {
+  (void)argc;
+  (void)argv;
+  int deviceCount;
+  cutilSafeCall(hipGetDeviceCount(&deviceCount));
+  if (deviceCount == 0) {
+    printf("cutil error: no devices supporting HIP\n");
+    exit(-1);
+  }
+  printf("Trying to select a device\n");
+
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  // no need to hardcode the following, it can be done via numawrap scripts
+  /*if (getenv("OMPI_COMM_WORLD_LOCAL_RANK")!=NULL) {
+    rank = atoi(getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
+  } else if (getenv("MV2_COMM_WORLD_LOCAL_RANK")!=NULL) {
+    rank = atoi(getenv("MV2_COMM_WORLD_LOCAL_RANK"));
+  } else if (getenv("MPI_LOCALRANKID")!=NULL) {
+    rank = atoi(getenv("MPI_LOCALRANKID"));
+  } else {
+    rank = rank%deviceCount;
+  }*/
+
+  // Test we have access to a device
+
+  // This commented out test does not work with CUDA versions above 6.5
+  /*float *test;
+  hipError_t err = hipMalloc((void **)&test, sizeof(float));
+  if (err != hipSuccess) {
+    OP_hybrid_gpu = 0;
+  } else {
+    OP_hybrid_gpu = 1;
+  }
+  if (OP_hybrid_gpu) {
+    hipFree(test);
+
+    cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferL1));
+
+    int deviceId = -1;
+    hipGetDevice(&deviceId);
+    cudaDeviceProp_t deviceProp;
+    cutilSafeCall ( hipGetDeviceProperties ( &deviceProp, deviceId ) );
+    printf ( "\n Using HIP device: %d %s on rank %d\n",deviceId,
+             deviceProp.name,rank );
+  } else {
+    printf ( "\n Using CPU on rank %d\n",rank );
+  }*/
+  //omp_set_default_device(rank);
+//  hipError_t err = hipSetDevice(rank);
+  float *test;
+  OP_hybrid_gpu = 0;
+  //hipError_t err = hipMalloc((void **)&test, sizeof(float));
+  for (int i = 0; i < deviceCount; i++) {
+    hipError_t err = hipSetDevice((i+rank)%deviceCount);
+    if (err == hipSuccess) {
+      hipError_t err = hipMalloc((void **)&test, sizeof(float));
+      if (err == hipSuccess) {
+        OP_hybrid_gpu = 1;
+        break;
+      }
+    }
+  }
+  if (OP_hybrid_gpu) {
+    cutilSafeCall(hipFree(test));
+
+    cutilSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferL1));
+
+    int deviceId = -1;
+    hipGetDevice(&deviceId);
+    cudaDeviceProp_t deviceProp;
+    cutilSafeCall(hipGetDeviceProperties(&deviceProp, deviceId));
+    printf("\n Using HIP device: %d %s on rank %d\n", deviceId,
+           deviceProp.name, rank);
+    cutilSafeCall(hipStreamCreateWithFlags(&op2_grp_secondary, hipStreamNonBlocking));
+    cutilSafeCall(hipEventCreateWithFlags(&op2_grp_download_event, hipEventDisableTiming));
+  } else {
+    printf("\n Using CPU on rank %d\n", rank);
+  }
+}
+
+void op_upload_dat(op_dat dat) {
+  if (OP_import_exec_list==NULL) return;
+  // printf("Uploading %s\n", dat->name);
+  int set_size = dat->set->size + OP_import_exec_list[dat->set->index]->size +
+                 OP_import_nonexec_list[dat->set->index]->size;
+  if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+    char *temp_data = (char *)xmalloc(dat->size * set_size * sizeof(char));
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          temp_data[element_size * i * set_size + element_size * j + c] =
+              dat->data[dat->size * j + element_size * i + c];
+        }
+      }
+    }
+    cutilSafeCall(hipMemcpy(dat->data_d, temp_data, set_size * dat->size,
+                            hipMemcpyHostToDevice));
+    free(temp_data);
+  } else {
+    cutilSafeCall(hipMemcpy(dat->data_d, dat->data, set_size * dat->size,
+                            hipMemcpyHostToDevice));
+  }
+}
+
+void op_download_dat(op_dat dat) {
+  //Check if partitioning is done
+  if (OP_import_exec_list==NULL) return;
+  // printf("Downloading %s\n", dat->name);
+  int set_size = dat->set->size + OP_import_exec_list[dat->set->index]->size +
+                 OP_import_nonexec_list[dat->set->index]->size;
+  if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+    char *temp_data = (char *)xmalloc(dat->size * set_size * sizeof(char));
+    cutilSafeCall(hipMemcpy(temp_data, dat->data_d, set_size * dat->size,
+                            hipMemcpyDeviceToHost));
+    int element_size = dat->size / dat->dim;
+    for (int i = 0; i < dat->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        for (int c = 0; c < element_size; c++) {
+          dat->data[dat->size * j + element_size * i + c] =
+              temp_data[element_size * i * set_size + element_size * j + c];
+        }
+      }
+    }
+    free(temp_data);
+  } else {
+    cutilSafeCall(hipMemcpy(dat->data, dat->data_d, set_size * dat->size,
+                            hipMemcpyDeviceToHost));
+  }
+}
+
+void op_exchange_halo_cuda(op_arg *arg, int exec_flag) {
+  op_dat dat = arg->dat;
+
+  if (arg->sent == 1) {
+    printf("Error: Halo exchange already in flight for dat %s\n", dat->name);
+    fflush(stdout);
+    MPI_Abort(OP_MPI_WORLD, 2);
+  }
+
+  // For a directly accessed op_dat do not do halo exchanges if not executing
+  // over redundant compute block
+  if (exec_flag == 0 && arg->idx == -1)
+    return;
+
+  arg->sent = 0; // reset flag
+  // need to exchange both direct and indirect data sets if they are dirty
+  if ((arg->opt) &&
+      (arg->acc == OP_READ ||
+       arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) &&
+      (dat->dirtybit == 1)) {
+
+    halo_list imp_exec_list = OP_import_exec_list[dat->set->index];
+    halo_list imp_nonexec_list = OP_import_nonexec_list[dat->set->index];
+
+    halo_list exp_exec_list = OP_export_exec_list[dat->set->index];
+    halo_list exp_nonexec_list = OP_export_nonexec_list[dat->set->index];
+
+    //-------first exchange exec elements related to this data array--------
+
+    // sanity checks
+    if (compare_sets(imp_exec_list->set, dat->set) == 0) {
+      printf("Error: Import list and set mismatch\n");
+      MPI_Abort(OP_MPI_WORLD, 2);
+    }
+    if (compare_sets(exp_exec_list->set, dat->set) == 0) {
+      printf("Error: Export list and set mismatch\n");
+      MPI_Abort(OP_MPI_WORLD, 2);
+    }
+
+    gather_data_to_buffer(*arg, exp_exec_list, exp_nonexec_list);
+
+    char *outptr_exec = NULL;
+    char *outptr_nonexec = NULL;
+    if (OP_gpu_direct) {
+      outptr_exec = arg->dat->buffer_d;
+      outptr_nonexec =
+          arg->dat->buffer_d + exp_exec_list->size * arg->dat->size;
+      cutilSafeCall(hipDeviceSynchronize());
+    } else {
+      cutilSafeCall(hipMemcpy(
+          ((op_mpi_buffer)(dat->mpi_buffer))->buf_exec, arg->dat->buffer_d,
+          exp_exec_list->size * arg->dat->size, hipMemcpyDeviceToHost));
+
+      cutilSafeCall(hipMemcpy(
+          ((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec,
+          arg->dat->buffer_d + exp_exec_list->size * arg->dat->size,
+          exp_nonexec_list->size * arg->dat->size, hipMemcpyDeviceToHost));
+
+      cutilSafeCall(hipDeviceSynchronize());
+      outptr_exec = ((op_mpi_buffer)(dat->mpi_buffer))->buf_exec;
+      outptr_nonexec = ((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec;
+    }
+
+    for
(int i = 0; i < exp_exec_list->ranks_size; i++) { + MPI_Isend(&outptr_exec[exp_exec_list->disps[i] * dat->size], + dat->size * exp_exec_list->sizes[i], MPI_CHAR, + exp_exec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]); + } + + int init = dat->set->size * dat->size; + char *ptr = NULL; + for (int i = 0; i < imp_exec_list->ranks_size; i++) { + ptr = OP_gpu_direct + ? &(dat->data_d[init + imp_exec_list->disps[i] * dat->size]) + : &(dat->data[init + imp_exec_list->disps[i] * dat->size]); + if (OP_gpu_direct && (strstr(arg->dat->type, ":soa") != NULL || + (OP_auto_soa && arg->dat->dim > 1))) + ptr = dat->buffer_d_r + imp_exec_list->disps[i] * dat->size; + MPI_Irecv(ptr, dat->size * imp_exec_list->sizes[i], MPI_CHAR, + imp_exec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]); + } + + //-----second exchange nonexec elements related to this data array------ + // sanity checks + if (compare_sets(imp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Import list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Export list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + for (int i = 0; i < exp_nonexec_list->ranks_size; i++) { + MPI_Isend(&outptr_nonexec[exp_nonexec_list->disps[i] * dat->size], + dat->size * exp_nonexec_list->sizes[i], MPI_CHAR, + exp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]); + } + + int nonexec_init = (dat->set->size + imp_exec_list->size) * dat->size; + for (int i = 0; i < imp_nonexec_list->ranks_size; i++) { + ptr = OP_gpu_direct + ? 
&(dat->data_d[nonexec_init + + imp_nonexec_list->disps[i] * dat->size]) + : &(dat->data[nonexec_init + + imp_nonexec_list->disps[i] * dat->size]); + if (OP_gpu_direct && (strstr(arg->dat->type, ":soa") != NULL || + (OP_auto_soa && arg->dat->dim > 1))) + ptr = dat->buffer_d_r + + (imp_exec_list->size + imp_exec_list->disps[i]) * dat->size; + MPI_Irecv(ptr, dat->size * imp_nonexec_list->sizes[i], MPI_CHAR, + imp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]); + } + + // clear dirty bit + dat->dirtybit = 0; + arg->sent = 1; + } +} + +void op_exchange_halo_partial_cuda(op_arg *arg, int exec_flag) { + op_dat dat = arg->dat; + + if (arg->sent == 1) { + printf("Error: Halo exchange already in flight for dat %s\n", dat->name); + fflush(stdout); + MPI_Abort(OP_MPI_WORLD, 2); + } + + // For a directly accessed op_dat do not do halo exchanges if not executing + // over + // redundant compute block + if (exec_flag == 0 && arg->idx == -1) + return; + + arg->sent = 0; // reset flag + // need to exchange both direct and indirect data sets if they are dirty + if ((arg->opt) && + (arg->acc == OP_READ || + arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) && + (dat->dirtybit == 1)) { + + halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index]; + halo_list exp_nonexec_list = OP_export_nonexec_permap[arg->map->index]; + + //-------first exchange exec elements related to this data array-------- + + // sanity checks + if (compare_sets(imp_nonexec_list->set, dat->set) == 0) { + printf("Error: Import list and set mismatch\n"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_nonexec_list->set, dat->set) == 0) { + printf("Error: Export list and set mismatch\n"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + gather_data_to_buffer_partial(*arg, exp_nonexec_list); + + char *outptr_nonexec = NULL; + if (OP_gpu_direct) { + outptr_nonexec = arg->dat->buffer_d; + cutilSafeCall(hipDeviceSynchronize()); + } else { + cutilSafeCall(hipMemcpy( + ((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec, arg->dat->buffer_d, + exp_nonexec_list->size * arg->dat->size, hipMemcpyDeviceToHost)); + + cutilSafeCall(hipDeviceSynchronize()); + outptr_nonexec = ((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec; + } + + //-----second exchange nonexec elements related to this data array------ + // sanity checks + if (compare_sets(imp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Import list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Export list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + for (int i = 0; i < exp_nonexec_list->ranks_size; i++) { + MPI_Isend(&outptr_nonexec[exp_nonexec_list->disps[i] * dat->size], + dat->size * exp_nonexec_list->sizes[i], MPI_CHAR, + exp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]); + } + + int nonexec_init = OP_export_nonexec_permap[arg->map->index]->size; + for (int i = 0; i < imp_nonexec_list->ranks_size; i++) { + char *ptr = + OP_gpu_direct + ? 
&arg->dat + ->buffer_d[(nonexec_init + imp_nonexec_list->disps[i]) * + dat->size] + : &((op_mpi_buffer)(dat->mpi_buffer)) + ->buf_nonexec[(nonexec_init + imp_nonexec_list->disps[i]) * + dat->size]; + MPI_Irecv(ptr, dat->size * imp_nonexec_list->sizes[i], MPI_CHAR, + imp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]); + } + + arg->sent = 1; + } +} + +void op_exchange_halo(op_arg *arg, int exec_flag) { + op_dat dat = arg->dat; + + if (exec_flag == 0 && arg->idx == -1) + return; + if (arg->opt == 0) + return; + + if (arg->sent == 1) { + printf("Error: Halo exchange already in flight for dat %s\n", dat->name); + fflush(stdout); + MPI_Abort(OP_MPI_WORLD, 2); + } + + // need to exchange both direct and indirect data sets if they are dirty + if ((arg->acc == OP_READ || + arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) && + (dat->dirtybit == 1)) { + // printf("Exchanging Halo of data array %10s\n",dat->name); + halo_list imp_exec_list = OP_import_exec_list[dat->set->index]; + halo_list imp_nonexec_list = OP_import_nonexec_list[dat->set->index]; + + halo_list exp_exec_list = OP_export_exec_list[dat->set->index]; + halo_list exp_nonexec_list = OP_export_nonexec_list[dat->set->index]; + + //-------first exchange exec elements related to this data array-------- + + // sanity checks + if (compare_sets(imp_exec_list->set, dat->set) == 0) { + printf("Error: Import list and set mismatch\n"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_exec_list->set, dat->set) == 0) { + printf("Error: Export list and set mismatch\n"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + int set_elem_index; + for (int i = 0; i < exp_exec_list->ranks_size; i++) { + for (int j = 0; j < exp_exec_list->sizes[i]; j++) { + set_elem_index = exp_exec_list->list[exp_exec_list->disps[i] + j]; + memcpy(&((op_mpi_buffer)(dat->mpi_buffer)) + ->buf_exec[exp_exec_list->disps[i] * dat->size + + j * dat->size], + (void *)&dat->data[dat->size * (set_elem_index)], dat->size); + } + MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer)) + ->buf_exec[exp_exec_list->disps[i] * dat->size], + dat->size * exp_exec_list->sizes[i], MPI_CHAR, + exp_exec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]); + } + + int init = dat->set->size * dat->size; + for (int i = 0; i < imp_exec_list->ranks_size; i++) { + MPI_Irecv(&(dat->data[init + imp_exec_list->disps[i] * dat->size]), + dat->size * imp_exec_list->sizes[i], MPI_CHAR, + imp_exec_list->ranks[i], dat->index, OP_MPI_WORLD, + &((op_mpi_buffer)(dat->mpi_buffer)) + ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]); + } + + //-----second exchange nonexec elements related to this data array------ + // sanity checks + if (compare_sets(imp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Import list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + if (compare_sets(exp_nonexec_list->set, dat->set) == 0) { + printf("Error: Non-Export list and set mismatch"); + MPI_Abort(OP_MPI_WORLD, 2); + } + + for (int i = 0; i < exp_nonexec_list->ranks_size; i++) { + for (int j = 0; j < exp_nonexec_list->sizes[i]; j++) { + set_elem_index = exp_nonexec_list->list[exp_nonexec_list->disps[i] + j]; + memcpy(&((op_mpi_buffer)(dat->mpi_buffer)) + ->buf_nonexec[exp_nonexec_list->disps[i] * dat->size + + j * dat->size], + (void *)&dat->data[dat->size * (set_elem_index)], dat->size); + } + 
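+      // all of rank i's nonexec elements are now packed contiguously in
+      // buf_nonexec; post the non-blocking send for that segment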
MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer))
+                     ->buf_nonexec[exp_nonexec_list->disps[i] * dat->size],
+                dat->size * exp_nonexec_list->sizes[i], MPI_CHAR,
+                exp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD,
+                &((op_mpi_buffer)(dat->mpi_buffer))
+                     ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]);
+    }
+
+    int nonexec_init = (dat->set->size + imp_exec_list->size) * dat->size;
+    for (int i = 0; i < imp_nonexec_list->ranks_size; i++) {
+      MPI_Irecv(
+          &(dat->data[nonexec_init + imp_nonexec_list->disps[i] * dat->size]),
+          dat->size * imp_nonexec_list->sizes[i], MPI_CHAR,
+          imp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD,
+          &((op_mpi_buffer)(dat->mpi_buffer))
+               ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]);
+    }
+    // clear dirty bit
+    dat->dirtybit = 0;
+    arg->sent = 1;
+  }
+}
+
+void op_exchange_halo_partial(op_arg *arg, int exec_flag) {
+  op_dat dat = arg->dat;
+
+  if (arg->opt == 0)
+    return;
+
+  if (arg->sent == 1) {
+    printf("Error: Halo exchange already in flight for dat %s\n", dat->name);
+    fflush(stdout);
+    MPI_Abort(OP_MPI_WORLD, 2);
+  }
+  arg->sent = 0; // reset flag
+
+  // need to exchange indirect data sets if they are dirty
+  if ((arg->acc == OP_READ ||
+       arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) &&
+      (dat->dirtybit == 1)) {
+    int rank;
+    MPI_Comm_rank(OP_MPI_WORLD, &rank);
+    halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index];
+    halo_list exp_nonexec_list = OP_export_nonexec_permap[arg->map->index];
+    //-------exchange nonexec elements related to this data array and
+    // map--------
+
+    // sanity checks
+    if (compare_sets(imp_nonexec_list->set, dat->set) == 0) {
+      printf("Error: Import list and set mismatch\n");
+      MPI_Abort(OP_MPI_WORLD, 2);
+    }
+    if (compare_sets(exp_nonexec_list->set, dat->set) == 0) {
+      printf("Error: Export list and set mismatch\n");
+      MPI_Abort(OP_MPI_WORLD, 2);
+    }
+
+    int set_elem_index;
+    for (int i = 0; i < exp_nonexec_list->ranks_size; i++) {
+      for (int j = 0; j < exp_nonexec_list->sizes[i]; j++) {
+        set_elem_index = exp_nonexec_list->list[exp_nonexec_list->disps[i] + j];
+        memcpy(&((op_mpi_buffer)(dat->mpi_buffer))
+                    ->buf_nonexec[exp_nonexec_list->disps[i] * dat->size +
+                                  j * dat->size],
+               (void *)&dat->data[dat->size * (set_elem_index)], dat->size);
+      }
+      MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer))
+                     ->buf_nonexec[exp_nonexec_list->disps[i] * dat->size],
+                dat->size * exp_nonexec_list->sizes[i], MPI_CHAR,
+                exp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD,
+                &((op_mpi_buffer)(dat->mpi_buffer))
+                     ->s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]);
+    }
+
+    int init = exp_nonexec_list->size;
+    for (int i = 0; i < imp_nonexec_list->ranks_size; i++) {
+      MPI_Irecv(
+          &((op_mpi_buffer)(dat->mpi_buffer))
+               ->buf_nonexec[(init + imp_nonexec_list->disps[i]) * dat->size],
+          dat->size * imp_nonexec_list->sizes[i], MPI_CHAR,
+          imp_nonexec_list->ranks[i], dat->index, OP_MPI_WORLD,
+          &((op_mpi_buffer)(dat->mpi_buffer))
+               ->r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]);
+    }
+    // note that we are not setting the dirtybit to 0, since it's not a full
+    // exchange
+    arg->sent = 1;
+  }
+}
+
+void op_wait_all_cuda(op_arg *arg) {
+  if (arg->opt && arg->argtype == OP_ARG_DAT && arg->sent == 1) {
+    op_dat dat = arg->dat;
+    MPI_Waitall(((op_mpi_buffer)(dat->mpi_buffer))->s_num_req,
+                ((op_mpi_buffer)(dat->mpi_buffer))->s_req, MPI_STATUSES_IGNORE);
+    MPI_Waitall(((op_mpi_buffer)(dat->mpi_buffer))->r_num_req,
+                ((op_mpi_buffer)(dat->mpi_buffer))->r_req, MPI_STATUSES_IGNORE);
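+    // sends and receives have all completed; reset the request counters so
+    // the mpi_buffer can be reused by the next halo exchange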
+    ((op_mpi_buffer)(dat->mpi_buffer))->s_num_req = 0;
+    ((op_mpi_buffer)(dat->mpi_buffer))->r_num_req = 0;
+
+    if (arg->map != OP_ID && OP_map_partial_exchange[arg->map->index]) {
+      halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index];
+      int nonexec_init = OP_export_nonexec_permap[arg->map->index]->size;
+      if (OP_gpu_direct == 0)
+        cutilSafeCall(hipMemcpyAsync(
+            dat->buffer_d + nonexec_init * dat->size,
+            &((op_mpi_buffer)(dat->mpi_buffer))
+                 ->buf_nonexec[nonexec_init * dat->size],
+            imp_nonexec_list->size * dat->size, hipMemcpyHostToDevice, 0));
+      scatter_data_from_buffer_partial(*arg);
+    } else {
+      if (OP_gpu_direct == 0) {
+        if (strstr(arg->dat->type, ":soa") != NULL ||
+            (OP_auto_soa && arg->dat->dim > 1)) {
+          int init = dat->set->size * dat->size;
+          int size = (dat->set->exec_size + dat->set->nonexec_size) * dat->size;
+          cutilSafeCall(hipMemcpyAsync(dat->buffer_d_r, dat->data + init, size,
+                                       hipMemcpyHostToDevice, 0));
+          scatter_data_from_buffer(*arg);
+        } else {
+          int init = dat->set->size * dat->size;
+          cutilSafeCall(
+              hipMemcpyAsync(dat->data_d + init, dat->data + init,
+                             (OP_import_exec_list[dat->set->index]->size +
+                              OP_import_nonexec_list[dat->set->index]->size) *
+                                 arg->dat->size,
+                             hipMemcpyHostToDevice, 0));
+        }
+      } else if (strstr(arg->dat->type, ":soa") != NULL ||
+                 (OP_auto_soa && arg->dat->dim > 1))
+        scatter_data_from_buffer(*arg);
+    }
+    arg->sent = 2; // set flag to indicate completed comm
+  }
+}
+
+void op_wait_all(op_arg *arg) {
+  if (arg->opt && arg->argtype == OP_ARG_DAT && arg->sent == 1) {
+    op_dat dat = arg->dat;
+    MPI_Waitall(((op_mpi_buffer)(dat->mpi_buffer))->s_num_req,
+                ((op_mpi_buffer)(dat->mpi_buffer))->s_req, MPI_STATUSES_IGNORE);
+    MPI_Waitall(((op_mpi_buffer)(dat->mpi_buffer))->r_num_req,
+                ((op_mpi_buffer)(dat->mpi_buffer))->r_req, MPI_STATUSES_IGNORE);
+    ((op_mpi_buffer)(dat->mpi_buffer))->s_num_req = 0;
+    ((op_mpi_buffer)(dat->mpi_buffer))->r_num_req = 0;
+    if (arg->map != OP_ID && OP_map_partial_exchange[arg->map->index]) {
+      halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index];
+      int init = OP_export_nonexec_permap[arg->map->index]->size;
+      char *buffer =
+          &((op_mpi_buffer)(dat->mpi_buffer))->buf_nonexec[init * dat->size];
+      for (int i = 0; i < imp_nonexec_list->size; i++) {
+        int set_elem_index = imp_nonexec_list->list[i];
+        memcpy((void *)&dat->data[dat->size * (set_elem_index)],
+               &buffer[i * dat->size], dat->size);
+      }
+    }
+  }
+  arg->sent = 0;
+}
+
+void op_partition(const char *lib_name, const char *lib_routine,
+                  op_set prime_set, op_map prime_map, op_dat coords) {
+  partition(lib_name, lib_routine, prime_set, prime_map, coords);
+  if (!OP_hybrid_gpu)
+    return;
+  op_move_to_device();
+}
+
+void op_move_to_device() {
+  for (int s = 0; s < OP_set_index; s++) {
+    op_set set = OP_set_list[s];
+    op_dat_entry *item;
+    TAILQ_FOREACH(item, &OP_dat_list, entries) {
+      op_dat dat = item->dat;
+
+      if (dat->set->index == set->index)
+        op_mv_halo_device(set, dat);
+    }
+  }
+
+  for (int m = 0; m < OP_map_index; m++) {
+    // Upload maps in transposed form
+    op_map map = OP_map_list[m];
+    int set_size = map->from->size + map->from->exec_size;
+    int *temp_map = (int *)xmalloc(map->dim * set_size * sizeof(int));
+    for (int i = 0; i < map->dim; i++) {
+      for (int j = 0; j < set_size; j++) {
+        temp_map[i * set_size + j] = map->map[map->dim * j + i];
+      }
+    }
+    op_cpHostToDevice((void **)&(map->map_d), (void **)&(temp_map),
+                      map->dim * set_size * sizeof(int));
+    free(temp_map);
+  }
+
+  op_mv_halo_list_device();
+}
+
+int op_is_root() {
+  int my_rank;
+  MPI_Comm_rank(OP_MPI_WORLD, &my_rank);
+  return (my_rank == MPI_ROOT);
+}
+
+int op2_grp_size_recv_old = 0;
+int op2_grp_size_send_old = 0;
+void op_realloc_comm_buffer(char **send_buffer_host, char **recv_buffer_host,
+      char **send_buffer_device, char **recv_buffer_device, int device,
+      unsigned size_send, unsigned size_recv) {
+  if (op2_grp_size_recv_old < size_recv) {
+    //if (*recv_buffer_host != NULL) cutilSafeCall(hipHostFree(*recv_buffer_host));
+    if (*recv_buffer_device != NULL) cutilSafeCall(hipFree(*recv_buffer_device));
+    cutilSafeCall(hipMalloc(recv_buffer_device, size_recv));
+    //cutilSafeCall(hipMallocHost(recv_buffer_host, size_send));
+    if (op2_grp_size_recv_old>0) cutilSafeCall(hipHostUnregister ( *recv_buffer_host ));
+    *recv_buffer_host = (char*)op_realloc(*recv_buffer_host, size_recv);
+    cutilSafeCall(hipHostRegister ( *recv_buffer_host, size_recv, hipHostRegisterDefault ));
+    op2_grp_size_recv_old = size_recv;
+  }
+  if (op2_grp_size_send_old < size_send) {
+    //if (*send_buffer_host != NULL) cutilSafeCall(hipHostFree(*send_buffer_host));
+    if (*send_buffer_device != NULL) cutilSafeCall(hipFree(*send_buffer_device));
+    cutilSafeCall(hipMalloc(send_buffer_device, size_send));
+    //cutilSafeCall(hipMallocHost(send_buffer_host, size_recv));
+    if (op2_grp_size_send_old>0) cutilSafeCall(hipHostUnregister ( *send_buffer_host ));
+    *send_buffer_host = (char*)op_realloc(*send_buffer_host, size_send);
+    cutilSafeCall(hipHostRegister ( *send_buffer_host, size_send, hipHostRegisterDefault ));
+    op2_grp_size_send_old = size_send;
+  }
+}
+
+void op_download_buffer_async(char *send_buffer_device, char *send_buffer_host, unsigned size_send) {
+  //Make sure gather kernels on the 0 stream finished before starting download
+  cutilSafeCall(hipEventRecord(op2_grp_download_event,0));
+  cutilSafeCall(hipStreamWaitEvent(op2_grp_secondary, op2_grp_download_event,0));
+  cutilSafeCall(hipMemcpyAsync(send_buffer_host, send_buffer_device, size_send, hipMemcpyDeviceToHost, op2_grp_secondary));
+}
+void op_upload_buffer_async (char *recv_buffer_device, char *recv_buffer_host, unsigned size_recv) {
+  cutilSafeCall(hipMemcpyAsync(recv_buffer_device, recv_buffer_host, size_recv, hipMemcpyHostToDevice, op2_grp_secondary));
+}
+
+void op_scatter_sync() {
+  cutilSafeCall(hipEventRecord(op2_grp_download_event, op2_grp_secondary));
+  cutilSafeCall(hipStreamWaitEvent(0, op2_grp_download_event,0));
+}
+void op_download_buffer_sync() {
+  cutilSafeCall(hipStreamSynchronize(op2_grp_secondary));
+}
diff --git a/translator/c/op2.py b/translator/c/op2.py
index 97a9997d2..ade0ccc85 100755
--- a/translator/c/op2.py
+++ b/translator/c/op2.py
@@ -933,6 +933,7 @@ def main(srcFilesAndDirs=sys.argv[1:]):
   #code generators for NVIDIA GPUs with CUDA
   #op2_gen_cuda(masterFile, date, consts, kernels,sets) # Optimized for Fermi GPUs
   op2_gen_cuda_simple(masterFile, date, consts, kernels, sets, macro_defs) # Optimized for Kepler GPUs
+  op2_gen_cuda_simple(masterFile, date, consts, kernels, sets, macro_defs, hip = 1) # HIP variant of the same generator
 
 # generates openmp code as well as cuda code into the same file
   op2_gen_cuda_simple_hyb(masterFile, date, consts, kernels, sets) # CPU and GPU will then do computations as a hybrid application
diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py
index 436f1537e..d3551aaaf 100644
--- a/translator/c/op2_gen_cuda_simple.py
+++ b/translator/c/op2_gen_cuda_simple.py
@@ -94,11 +94,15 @@ def ENDIF():
   elif CPP: code('}')
 
-def
diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py
index 436f1537e..d3551aaaf 100644
--- a/translator/c/op2_gen_cuda_simple.py
+++ b/translator/c/op2_gen_cuda_simple.py
@@ -94,11 +94,15 @@ def ENDIF():
   elif CPP:
     code('}')
 
-def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
+def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0):
 
   global dims, idxs, typs, indtyps, inddims
   global FORTRAN, CPP, g_m, file_text, depth
 
+  cuda = 'cuda'
+  if hip == 1:
+    cuda = 'hip'
+
   OP_ID = 1; OP_GBL = 2; OP_MAP = 3;
 
   OP_READ = 1; OP_WRITE = 2; OP_RW = 3;
@@ -748,10 +752,13 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
       if optflags[g_m]==1:
         IF('optflags & 1<<'+str(optidxs[g_m]))
       for d in range(0,int(dims[g_m])):
+        atomicAdd = 'atomicAdd'
+        if hip == 1:
+          atomicAdd = 'unsafeAtomicAdd'
        if soaflags[g_m]:
-          code('atomicAdd(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx],_l['+str(d)+']);')
+          code(atomicAdd+'(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx],_l['+str(d)+']);')
        else:
-          code('atomicAdd(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>],_l['+str(d)+']);')
+          code(atomicAdd+'(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>],_l['+str(d)+']);')
 
       if optflags[g_m]==1:
         ENDIF()
@@ -966,12 +973,12 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
           k = k + [mapnames[g_m]]
           IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
           code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
-          code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));')
+          code(cuda+'MemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));')
           ENDIF()
       if dir_soa!=-1:
         IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
         code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
-        code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));')
+        code(cuda+'MemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));')
         ENDIF()
 
 #
@@ -1241,7 +1248,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
 # update kernel record
 #
 
-    code('cutilSafeCall(cudaDeviceSynchronize());')
+    code('cutilSafeCall('+cuda+'DeviceSynchronize());')
     comm('update kernel record')
     code('op_timers_core(&cpu_t2, &wall_t2);')
     code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;')
@@ -1266,9 +1273,9 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
 ##########################################################################
 #  output individual kernel file
 ##########################################################################
-  if not os.path.exists('cuda'):
-    os.makedirs('cuda')
-  fid = open('cuda/'+name+'_kernel.cu','w')
+  if not os.path.exists(cuda):
+    os.makedirs(cuda)
+  fid = open(cuda+'/'+name+'_kernel.'+('cpp' if hip==1 else 'cu'),'w')
   date = datetime.datetime.now()
   fid.write('//\n// auto-generated by op2.py\n//\n\n')
   fid.write(file_text)
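Note: with hip = 1 the indirect-increment path above emits unsafeAtomicAdd rather than atomicAdd. On AMD GPUs unsafeAtomicAdd can compile to the native floating-point atomic instruction instead of a compare-and-swap loop, usually in combination with the -munsafe-fp-atomics compiler flag; the trade-off is weaker rounding and denormal guarantees. A rough, hand-written HIP illustration of the generated shape (ind_arg0, map0 and the increment value are placeholders, not actual generator output):

    #include <hip/hip_runtime.h>

    // Each thread accumulates a local value, then commits it with a
    // hardware FP atomic; several threads may target the same element.
    __global__ void inc_kernel(double *ind_arg0, const int *map0, int n) {
      int tid = blockIdx.x * blockDim.x + threadIdx.x;
      if (tid < n) {
        double _l = 1.0;                           // stand-in for the kernel's result
        unsafeAtomicAdd(&ind_arg0[map0[tid]], _l); // fast but relaxed FP atomic
      }
    }
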
"../user_types.h"') code('#include "op_lib_cpp.h"') - code('#include "op_cuda_rt_support.h"') - code('#include "op_cuda_reduction.h"') + code('#include "op_'+cuda+'_rt_support.h"') + code('#include "op_'+cuda+'_reduction.h"') for nc in range(0,len(consts)): code('') @@ -1321,7 +1328,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): IF('dim*sizeof('+consts[nc]['type'][1:-1]+')>MAX_CONST_SIZE') code('printf("error: MAX_CONST_SIZE not big enough\\n"); exit(1);') ENDIF() - code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));') + code('cutilSafeCall('+cuda+'MemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));') depth = depth - 2 code('}') @@ -1330,10 +1337,10 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): for nk in range(0,len(kernels)): file_text = file_text +\ - '#include "'+kernels[nk]['name']+'_kernel.cu"\n' + '#include "'+kernels[nk]['name']+'_kernel.'+('cpp' if hip==1 else 'cu')+'"\n' master = master.split('.')[0] - fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w') + fid = open(cuda+'/'+master.split('.')[0]+'_kernels.'+('cpp' if hip==1 else 'cu'),'w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() From d680600376074c1f83c6f9ebc5d5615e9de59b98 Mon Sep 17 00:00:00 2001 From: Istvan Reguly Date: Tue, 3 Jan 2023 21:23:50 +0100 Subject: [PATCH 2/5] Fixing bugs for CUDA with SoA and one of volna's kernels don't translate to vec --- translator/c/op2_gen_cuda_simple.py | 29 +++++++++++++++++++++-------- translator/c/op2_gen_mpi_vec.py | 5 ++++- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py index d3551aaaf..09c646025 100644 --- a/translator/c/op2_gen_cuda_simple.py +++ b/translator/c/op2_gen_cuda_simple.py @@ -200,6 +200,16 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0) for i in range(0,nargs): inds_staged[i] = inds_staged[i] + 1 + print(name) + no_restrict = [0]*nargs + for i in range(0,ninds): + varname = var[invinds[i]] + print(varname) + for j in range(0,nargs): + if varname == var[j] and maps[j] == OP_ID and soaflags[j]: + print('no restrict ' +varname+str(inds)) + no_restrict[invinds[i]] = 1 + break ########################################################################## # start with CUDA kernel function ########################################################################## @@ -274,12 +284,12 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0) return for i in range(0,nargs_novec): - var = signature_text.split(',')[i].strip() + var1 = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i] and (op_color2 or not (kernels[nk]['maps'][i] == OP_MAP and kernels[nk]['accs'][i] == OP_INC)): - var = var.replace('*','') - #locate var in body and replace by adding [idx] - length = len(re.compile('\\s+\\b').split(var)) - var2 = re.compile('\\s+\\b').split(var)[length-1].strip() + var1 = var1.replace('*','') + #locate var1 in body and replace by adding [idx] + length = len(re.compile('\\s+\\b').split(var1)) + var2 = re.compile('\\s+\\b').split(var1)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9_]*)\]'+'', var2+r'\1[(\2)*'+op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) @@ -309,10 
@@ -309,10 +319,13 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0)
     code('int optflags,')
 
   for g_m in range(0,ninds):
-    if (indaccs[g_m]==OP_READ):
-      code('const <TYP> *__restrict <ARG>,')
+    restrict = ''
+    if no_restrict[invinds[g_m]]==0:
+      restrict = '__restrict'
+    if indaccs[g_m]==OP_READ:
+      code('const <TYP> *'+restrict+' <ARG>,')
     else:
-      code('<TYP> *__restrict <ARG>,')
+      code('<TYP> *'+restrict+' <ARG>,')
 
   if nmaps > 0:
     k = []
diff --git a/translator/c/op2_gen_mpi_vec.py b/translator/c/op2_gen_mpi_vec.py
index 8dd34d810..9f2ca2e5c 100644
--- a/translator/c/op2_gen_mpi_vec.py
+++ b/translator/c/op2_gen_mpi_vec.py
@@ -803,7 +803,10 @@ def op2_gen_mpi_vec(master, date, consts, kernels):
   comm(' user kernel files')
 
   for nk in range(0,len(kernels)):
-    code('#include "'+kernels[nk]['name']+'_veckernel.cpp"')
+    if 'initBathymetry_large' in kernels[nk]['name']:
+      code('#include "../seq/'+kernels[nk]['name']+'_seqkernel.cpp"')
+    else:
+      code('#include "'+kernels[nk]['name']+'_veckernel.cpp"')
 
   master = master.split('.')[0]
   fid = open('vec/'+master.split('.')[0]+'_veckernels.cpp','w')
   fid.write('//\n// auto-generated by op2.py\n//\n\n')

From eb7cbe61b4a3b0ee6c69cffb7b4ca8690c74a55b Mon Sep 17 00:00:00 2001
From: Reguly Reguly
Date: Fri, 28 Apr 2023 11:06:09 +0300
Subject: [PATCH 3/5] OpenMP 4 patches, atomic version

---
 makefiles/compilers/c/clang.mk            |   4 +
 makefiles/compilers/c/cray.mk             |   4 +-
 op2/include/op_cuda_rt_support.h          |   4 +-
 op2/include/op_hip_rt_support.h           |   4 +-
 op2/src/hip/op_hip_decl.cpp               |  30 +++---
 op2/src/hip/op_hip_rt_support.cpp         |   4 +-
 op2/src/mpi/op_mpi_hip_decl.cpp           |   4 +-
 op2/src/mpi/op_mpi_part_core.cpp          |   6 +-
 op2/src/openmp4/op_openmp4_rt_support.cpp |  11 ++
 translator/c/op2_gen_openmp4.py           | 122 +++++++++++++++-------
 10 files changed, 131 insertions(+), 62 deletions(-)

diff --git a/makefiles/compilers/c/clang.mk b/makefiles/compilers/c/clang.mk
index 345f99000..756c3b2c1 100644
--- a/makefiles/compilers/c/clang.mk
+++ b/makefiles/compilers/c/clang.mk
@@ -4,3 +4,7 @@ CONFIG_CC := clang
 CONFIG_CXX := clang++
 
 CONFIG_CXXLINK ?= -lc++
+
+CONFIG_CPP_HAS_OMP_OFFLOAD ?= true
+OMP_OFFLOAD_CXXFLAGS = -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a
+CONFIG_OMP_OFFLOAD_CXXFLAGS = -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a
diff --git a/makefiles/compilers/c/cray.mk b/makefiles/compilers/c/cray.mk
index 02671eeea..2e41f37b0 100644
--- a/makefiles/compilers/c/cray.mk
+++ b/makefiles/compilers/c/cray.mk
@@ -23,4 +23,6 @@ CONFIG_OMP_CPPFLAGS ?= -fopenmp
 CONFIG_CPP_HAS_OMP ?= true
 
 # CONFIG_OMP_OFFLOAD_CPPFLAGS ?=
-CONFIG_CPP_HAS_OMP_OFFLOAD ?= false
\ No newline at end of file
+CONFIG_CPP_HAS_OMP_OFFLOAD ?= true
+OMP_OFFLOAD_CXXFLAGS = -fopenmp
+CONFIG_OMP_OFFLOAD_CXXFLAGS = -fopenmp
diff --git a/op2/include/op_cuda_rt_support.h b/op2/include/op_cuda_rt_support.h
index f9a761f79..4a7b21b3f 100644
--- a/op2/include/op_cuda_rt_support.h
+++ b/op2/include/op_cuda_rt_support.h
@@ -84,9 +84,9 @@ void cutilDeviceInit_mpi(int argc, char **argv, int mpi_rank);
  * routines to move arrays to/from GPU device
  */
 
-void op_mvHostToDevice(void **map, int size);
+void op_mvHostToDevice(void **map, size_t size);
 
-void op_cpHostToDevice(void **data_d, void **data_h, int size);
+void op_cpHostToDevice(void **data_d, void **data_h, size_t size);
 
 void op_cuda_get_data(op_dat dat);
 
diff --git a/op2/include/op_hip_rt_support.h b/op2/include/op_hip_rt_support.h
index 78eb42908..676b5a82d 
100644 --- a/op2/include/op_hip_rt_support.h +++ b/op2/include/op_hip_rt_support.h @@ -82,9 +82,9 @@ void cutilDeviceInit_mpi(int argc, char **argv, int mpi_rank); * routines to move arrays to/from GPU device */ -void op_mvHostToDevice(void **map, int size); +void op_mvHostToDevice(void **map, size_t size); -void op_cpHostToDevice(void **data_d, void **data_h, int size); +void op_cpHostToDevice(void **data_d, void **data_h, size_t size); void op_cuda_get_data(op_dat dat); diff --git a/op2/src/hip/op_hip_decl.cpp b/op2/src/hip/op_hip_decl.cpp index e3b3dc586..dc3203dbf 100644 --- a/op2/src/hip/op_hip_decl.cpp +++ b/op2/src/hip/op_hip_decl.cpp @@ -113,20 +113,20 @@ op_dat op_decl_dat_char(op_set set, int dim, char const *type, int size, if (strstr(type, ":soa") != NULL || (OP_auto_soa && dim > 1)) { char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); int element_size = dat->size / dat->dim; - for (int i = 0; i < dat->dim; i++) { - for (int j = 0; j < set_size; j++) { - for (int c = 0; c < element_size; c++) { + for (size_t i = 0; i < dat->dim; i++) { + for (size_t j = 0; j < set_size; j++) { + for (size_t c = 0; c < element_size; c++) { temp_data[element_size * i * set_size + element_size * j + c] = data[dat->size * j + element_size * i + c]; } } } op_cpHostToDevice((void **)&(dat->data_d), (void **)&(temp_data), - dat->size * set_size); + (size_t)dat->size * set_size); free(temp_data); } else { op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), - dat->size * set_size); + (size_t)dat->size * set_size); } return dat; @@ -142,7 +142,7 @@ op_dat op_decl_dat_temp_char(op_set set, int dim, char const *type, int size, dat->user_managed = 0; op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), - dat->size * set->size); + (size_t)dat->size * set->size); return dat; } @@ -162,14 +162,14 @@ op_map op_decl_map(op_set from, op_set to, int dim, int *imap, char const *name) { op_map map = op_decl_map_core(from, to, dim, imap, name); int set_size = map->from->size + map->from->exec_size; - int *temp_map = (int *)malloc(map->dim * set_size * sizeof(int)); - for (int i = 0; i < map->dim; i++) { - for (int j = 0; j < set_size; j++) { + int *temp_map = (int *)malloc((size_t)map->dim * set_size * sizeof(int)); + for (size_t i = 0; i < map->dim; i++) { + for (size_t j = 0; j < set_size; j++) { temp_map[i * set_size + j] = map->map[map->dim * j + i]; } } op_cpHostToDevice((void **)&(map->map_d), (void **)&(temp_map), - map->dim * set_size * sizeof(int)); + (size_t)map->dim * set_size * sizeof(int)); free(temp_map); return map; } @@ -304,20 +304,20 @@ void op_upload_all() { if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); int element_size = dat->size / dat->dim; - for (int i = 0; i < dat->dim; i++) { - for (int j = 0; j < set_size; j++) { - for (int c = 0; c < element_size; c++) { + for (size_t i = 0; i < dat->dim; i++) { + for (size_t j = 0; j < set_size; j++) { + for (size_t c = 0; c < element_size; c++) { temp_data[element_size * i * set_size + element_size * j + c] = dat->data[dat->size * j + element_size * i + c]; } } } - cutilSafeCall(hipMemcpy(dat->data_d, temp_data, dat->size * set_size, + cutilSafeCall(hipMemcpy(dat->data_d, temp_data, (size_t)dat->size * set_size, hipMemcpyHostToDevice)); dat->dirty_hd = 0; free(temp_data); } else { - cutilSafeCall(hipMemcpy(dat->data_d, dat->data, dat->size * set_size, + cutilSafeCall(hipMemcpy(dat->data_d, dat->data, 
(size_t)dat->size * set_size, hipMemcpyHostToDevice)); dat->dirty_hd = 0; } diff --git a/op2/src/hip/op_hip_rt_support.cpp b/op2/src/hip/op_hip_rt_support.cpp index c0ae1e637..3704124a7 100644 --- a/op2/src/hip/op_hip_rt_support.cpp +++ b/op2/src/hip/op_hip_rt_support.cpp @@ -93,7 +93,7 @@ void __cutilCheckMsg(const char *errorMessage, const char *file, // routines to move arrays to/from GPU device // -void op_mvHostToDevice(void **map, int size) { +void op_mvHostToDevice(void **map, size_t size) { if (!OP_hybrid_gpu || size == 0) return; void *tmp; @@ -104,7 +104,7 @@ void op_mvHostToDevice(void **map, int size) { *map = tmp; } -void op_cpHostToDevice(void **data_d, void **data_h, int size) { +void op_cpHostToDevice(void **data_d, void **data_h, size_t size) { if (!OP_hybrid_gpu) return; if (*data_d != NULL) cutilSafeCall(hipFree(*data_d)); diff --git a/op2/src/mpi/op_mpi_hip_decl.cpp b/op2/src/mpi/op_mpi_hip_decl.cpp index 5841e370a..ea6e04ebb 100644 --- a/op2/src/mpi/op_mpi_hip_decl.cpp +++ b/op2/src/mpi/op_mpi_hip_decl.cpp @@ -285,7 +285,7 @@ void op_mv_halo_list_device() { op_cpHostToDevice((void **)&(export_exec_list_d[set->index]), (void **)&(OP_export_exec_list[set->index]->list), - OP_export_exec_list[set->index]->size * sizeof(int)); + (size_t)OP_export_exec_list[set->index]->size * sizeof(int)); } if (export_nonexec_list_d != NULL) { @@ -302,7 +302,7 @@ void op_mv_halo_list_device() { op_cpHostToDevice((void **)&(export_nonexec_list_d[set->index]), (void **)&(OP_export_nonexec_list[set->index]->list), - OP_export_nonexec_list[set->index]->size * sizeof(int)); + (size_t)OP_export_nonexec_list[set->index]->size * sizeof(int)); } //for grouped, we need the disps array on device too diff --git a/op2/src/mpi/op_mpi_part_core.cpp b/op2/src/mpi/op_mpi_part_core.cpp index b68374169..f1efc84b3 100644 --- a/op2/src/mpi/op_mpi_part_core.cpp +++ b/op2/src/mpi/op_mpi_part_core.cpp @@ -1083,7 +1083,7 @@ static void migrate_all(int my_rank, int comm_size) { //MPI_Isend(sbuf[i], (size_t)dat->size/sizeof(double) * exp->sizes[i], MPI_DOUBLE, exp->ranks[i], // d, OP_PART_WORLD, &request_send[i]); if ((size_t)dat->size * exp->sizes[i] > (size_t)INT_MAX) printf("Integer overflow at %s: %d\n",__FILE__,__LINE__); - MPI_Isend(sbuf[i], (size_t)dat->size * exp->sizes[i], MPI_CHAR, exp->ranks[i], + MPI_Isend(sbuf[i], (size_t)dat->size/8 * exp->sizes[i], MPI_DOUBLE, exp->ranks[i], d, OP_PART_WORLD, &request_send[i]); } @@ -1096,8 +1096,8 @@ static void migrate_all(int my_rank, int comm_size) { // MPI_DOUBLE, imp->ranks[i], d, OP_PART_WORLD, // MPI_STATUS_IGNORE); if ((size_t)dat->size * imp->sizes[i] > (size_t)INT_MAX) printf("Integer overflow at %s: %d\n",__FILE__,__LINE__); - MPI_Recv(&rbuf[(size_t)imp->disps[i] * (size_t)dat->size], (size_t)dat->size * imp->sizes[i], - MPI_CHAR, imp->ranks[i], d, OP_PART_WORLD, + MPI_Recv(&rbuf[(size_t)imp->disps[i] * (size_t)dat->size], (size_t)dat->size/8 * imp->sizes[i], + MPI_DOUBLE, imp->ranks[i], d, OP_PART_WORLD, MPI_STATUS_IGNORE); } diff --git a/op2/src/openmp4/op_openmp4_rt_support.cpp b/op2/src/openmp4/op_openmp4_rt_support.cpp index afca7760a..078484915 100644 --- a/op2/src/openmp4/op_openmp4_rt_support.cpp +++ b/op2/src/openmp4/op_openmp4_rt_support.cpp @@ -310,6 +310,17 @@ void op_mpi_wait_all_cuda(int nargs, op_arg *args) { (void)args; } +int op_mpi_halo_exchanges_grouped(op_set set, int nargs, op_arg *args, int device){ + (void)device; + return device == 1 ? 
op_mpi_halo_exchanges(set, nargs, args) : op_mpi_halo_exchanges_cuda(set, nargs, args); +} + +void op_mpi_wait_all_grouped(int nargs, op_arg *args, int device) { + (void)device; + (void)nargs; + (void)args; +} + void op_mpi_reset_halos(int nargs, op_arg *args) { (void)nargs; (void)args; diff --git a/translator/c/op2_gen_openmp4.py b/translator/c/op2_gen_openmp4.py index ce9e4f17a..c119975c8 100644 --- a/translator/c/op2_gen_openmp4.py +++ b/translator/c/op2_gen_openmp4.py @@ -96,9 +96,10 @@ def op2_gen_openmp4(master, date, consts, kernels): OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] - op2_compiler = os.getenv('OP2_COMPILER','0'); - any_soa = 0 + op2_compiler = 'clang' #os.getenv('OP2_COMPILER','0'); maptype = 'map' + any_soa=0 + atomics = True for nk in range (0,len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) @@ -210,18 +211,18 @@ def op2_gen_openmp4(master, date, consts, kernels): for i in range(0,nargs_novec): var = signature_text.split(',')[i].strip() - if kernels[nk]['soaflags'][i]: + if kernels[nk]['soaflags'][i] and not (atomics and maps[i] == OP_MAP and accs[i] == OP_INC): var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: - body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ + body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9_]*)\]'+'', var2+r'\1[(\2)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) - body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ + body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9_]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) for nc in range(0,len(consts)): @@ -280,7 +281,10 @@ def op2_gen_openmp4(master, date, consts, kernels): params += indent + 'int dat'+str(g_m)+'size,' if ninds>0: # add indirect kernel specific params to kernel func call - params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' + if atomics: + params += indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' + else: + params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' else: # add direct kernel specific params to kernel func call params += indent + 'int count,' @@ -387,7 +391,6 @@ def op2_gen_openmp4(master, date, consts, kernels): ENDIF() code('') comm(' get plan') - code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit @@ -398,7 +401,7 @@ def op2_gen_openmp4(master, date, consts, kernels): code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') - code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') + code('int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2);') # # get part and block size @@ -425,7 +428,8 @@ def op2_gen_openmp4(master, date, consts, kernels): if ninds > 0: code('') - code('int ncolors = 0;') + if not atomics: + code('int ncolors = 0;') code('int set_size1 = set->size + set->exec_size;') code('') IF('set_size >0') @@ -480,19 +484,36 @@ def op2_gen_openmp4(master, date, consts, kernels): # prepare kernel params for 
indirect version # if ninds>0: - code('') - code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') - code('ncolors = Plan->ncolors;') - code('int *col_reord = Plan->col_reord;') - code('') - comm(' execute plan') - FOR('col','0','Plan->ncolors') - IF('col==1') - code('op_mpi_wait_all_cuda(nargs, args);') - ENDIF() - code('int start = Plan->col_offsets[0][col];') - code('int end = Plan->col_offsets[0][col+1];') - code('') + if atomics: + if reduct: + FOR('round','0','3') + else: + FOR('round','0','2') + IF('round==1') + code('op_mpi_wait_all_grouped(nargs, args, 2);') + #code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + if reduct: + code('int start = round==0 ? 0 : (round==1 ? set->core_size : set->size);') + code('int end = round==0 ? set->core_size : (round==1? set->size : set->size + set->exec_size);') + else: + code('int start = round==0 ? 0 : set->core_size;') + code('int end = round==0 ? set->core_size : set->size + set->exec_size;') + IF('end-start>0') + else: + code('') + code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') + code('ncolors = Plan->ncolors;') + code('int *col_reord = Plan->col_reord;') + code('') + comm(' execute plan') + FOR('col','0','Plan->ncolors') + IF('col==1') + code('op_mpi_wait_all_grouped(nargs, args, 2);') + ENDIF() + code('int start = Plan->col_offsets[0][col];') + code('int end = Plan->col_offsets[0][col+1];') + code('') # # kernel function call # @@ -501,10 +522,11 @@ def op2_gen_openmp4(master, date, consts, kernels): call_params = call_params.replace('*','') # set params for indirect version if ninds>0: + call_params = re.sub(r'\bcount\b','set_size1',call_params); call_params = call_params.replace('num_teams','part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread') # set params for direct version else: - call_params = re.sub('count','set->size',call_params); + call_params = re.sub(r'\bcount\b','set->size',call_params); call_params = call_params.replace('num_teams','part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') code(func_call_signaure_text.split(' ')[-1]+call_params+');') code('') @@ -512,7 +534,10 @@ def op2_gen_openmp4(master, date, consts, kernels): if ninds>0: if reduct: comm(' combine reduction data') - IF('col == Plan->ncolors_owned-1') + if atomics: + IF('round == 1') + else: + IF('col == Plan->ncolors_owned-1') for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: @@ -525,16 +550,23 @@ def op2_gen_openmp4(master, date, consts, kernels): error('internal error: invalid reduction option') ENDIF() ENDFOR() - code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') - code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + if not atomics: + code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') + code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + if ninds>0: + if atomics: + ENDIF() ENDIF() code('') #zero set size issues if ninds>0: - IF('set_size == 0 || set_size == set->core_size || ncolors == 1') - code('op_mpi_wait_all_cuda(nargs, args);') + if atomics: + IF('set_size == 0 || set_size == set->core_size') + else: + IF('set_size == 0 || set_size == set->core_size || ncolors == 1') + code('op_mpi_wait_all_grouped(nargs, args, 2);') ENDIF() # @@ -624,7 +656,7 @@ def op2_gen_openmp4(master, date, consts, kernels): line = '#pragma omp target teams' if op2_compiler == 'clang': line +=' distribute parallel for schedule(static,1)\\\n' + (depth+2)*' ' - 
line +=' num_teams(num_teams) thread_limit(nthread) '
+      line +=' thread_limit(nthread) ' #num_teams(num_teams)
     map_clause = ''
     if maptype == 'map':
       map_clause = 'map(to:'
@@ -675,10 +707,13 @@ def op2_gen_openmp4(master, date, consts, kernels):
 #  map extra pointers for indirect version
 #
   if ninds>0:
-    if maptype == 'map':
-      line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],'
+    if atomics:
+      line += '\\\n'+(depth+2)*' '+'map(to:'
     else:
-      line += '\\\n'+(depth+2)*' '+'map(to:col_reord,'
+      if maptype == 'map':
+        line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],'
+      else:
+        line += '\\\n'+(depth+2)*' '+'map(to:col_reord,'
     if nmaps > 0:
       k = []
       for g_m in range(0,nargs):
@@ -706,8 +741,11 @@ def op2_gen_openmp4(master, date, consts, kernels):
 #  start for loop indirect version
 #
   if ninds>0:
-    FOR('e','start','end')
-    code('int n_op = col_reord[e];')
+    if atomics:
+      FOR('n_op', 'start', 'end')
+    else:
+      FOR('e','start','end')
+      code('int n_op = col_reord[e];')
     if nmaps > 0:
       k = []
       for g_m in range(0,nargs):
@@ -779,7 +817,11 @@ def op2_gen_openmp4(master, date, consts, kernels):
       else:
         line = ''
     else:
-      if soaflags[g_m]:
+      if atomics and accs[g_m] == OP_INC:
+        code(typs[g_m] + ' arg'+str(g_m)+'_l['+dims[g_m]+'];')
+        code('for (int d = 0; d < '+dims[g_m]+';d++) arg'+str(g_m)+'_l[d] = 0;')
+        line += '&arg'+str(g_m)+'_l[0]'
+      elif soaflags[g_m]:
         line += '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]'
       else:
         line += '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
@@ -801,6 +843,16 @@ def op2_gen_openmp4(master, date, consts, kernels):
     code(inline_body_text)
     comm('end inline func')
 
+  for g_m in range(0,nargs):
+    if maps[g_m] == OP_MAP and atomics and accs[g_m] == OP_INC:
+      # FOR('d', '0', dims[g_m])
+      for d in range(0,int(dims[g_m])):
+        code('#pragma omp atomic')
+        if soaflags[g_m]:
+          code('data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx + '+str(d)+' * '+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'] += arg%d_l[%d];' % (g_m,d))
+        else:
+          code('data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx + '+str(d)+'] += arg%d_l[%d];' % (g_m,d))
+  # ENDFOR()
   ENDFOR()
   code('')
 # end kernel function

From 45d6097c4f75ec9fa839df632b5e99f36354d9d9 Mon Sep 17 00:00:00 2001
From: Reguly Reguly
Date: Tue, 10 Oct 2023 15:49:07 +0300
Subject: [PATCH 4/5] unsafe atomic adds for float/double only

---
 translator/c/op2_gen_cuda_simple.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py
index 09c646025..312857a22 100644
--- a/translator/c/op2_gen_cuda_simple.py
+++ b/translator/c/op2_gen_cuda_simple.py
@@ -118,8 +118,8 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0)
 
 #Optimization settings
   inc_stage=0
-  op_color2_force=1
-  atomics=0
+  op_color2_force=0
+  atomics=1
 
   name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
@@ -766,7 +766,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs, hip = 0)
         IF('optflags & 1<<'+str(optidxs[g_m]))
       for d in range(0,int(dims[g_m])):
         atomicAdd = 'atomicAdd'
-        if hip == 1:
+        if hip == 1 and (typs[g_m]=='float' or typs[g_m]=='double'):
           atomicAdd = 'unsafeAtomicAdd'
         if soaflags[g_m]:
           code(atomicAdd+'(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx],_l['+str(d)+']);')
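Note: patches 3 and 4 together move both the OpenMP 4 and the CUDA/HIP generators from plan-based colored execution to atomics. The emitted pattern is: accumulate each indirect OP_INC argument into a thread-local array inside the user kernel, then commit it with #pragma omp atomic (or atomicAdd/unsafeAtomicAdd on the GPU paths). A hand-written sketch of the OpenMP shape, with illustrative names (data0, map0, dimension fixed at 4) rather than real generator output:

    // Assumes data0 and map0 are already mapped to the device.
    void inc_loop(double *data0, const int *map0, int start, int end) {
      #pragma omp target teams distribute parallel for
      for (int n_op = start; n_op < end; n_op++) {
        double arg0_l[4];                           // thread-local accumulator
        for (int d = 0; d < 4; d++) arg0_l[d] = 0.0;
        /* ... user kernel increments arg0_l here ... */
        for (int d = 0; d < 4; d++) {
          #pragma omp atomic
          data0[4 * map0[n_op] + d] += arg0_l[d];   // atomic commit
        }
      }
    }

This removes the op_plan_get_stage coloring and the col_reord indirection at the cost of atomic updates, which the float/double-only unsafeAtomicAdd change above is evidently meant to keep cheap on the targeted AMD hardware.
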
From 28aa8dfe00a2e0daa69ab43c8bd97ed37e5aa0b9 Mon Sep 17 00:00:00 2001
From: ristvan
Date: Wed, 20 Dec 2023 14:09:11 -0800
Subject: [PATCH 5/5] int/size_t mismatches, nvhpc with omp offload

---
 makefiles/compilers/c/nvhpc.mk            | 2 +-
 op2/src/cuda/op_cuda_rt_support.cpp       | 4 ++--
 op2/src/openmp4/op_openmp4_decl.cpp       | 4 ++--
 op2/src/openmp4/op_openmp4_rt_support.cpp | 9 +++++----
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/makefiles/compilers/c/nvhpc.mk b/makefiles/compilers/c/nvhpc.mk
index 0245eda5c..68a7af3fc 100644
--- a/makefiles/compilers/c/nvhpc.mk
+++ b/makefiles/compilers/c/nvhpc.mk
@@ -20,5 +20,5 @@ CONFIG_CPP_HAS_OMP ?= true
 GPU_FFLAG := -gpu=fastmath,ptxinfo,lineinfo
 $(foreach arch,$(CUDA_GEN),$(eval GPU_FFLAG := $(GPU_FFLAG),cc$(arch)))
 
-CONFIG_OMP_OFFLOAD_CPPFLAGS ?= -mp=gpu $(GPU_FFLAG)
+CONFIG_OMP_OFFLOAD_CXXFLAGS ?= -mp=gpu $(GPU_FFLAG)
 CONFIG_CPP_HAS_OMP_OFFLOAD ?= true
diff --git a/op2/src/cuda/op_cuda_rt_support.cpp b/op2/src/cuda/op_cuda_rt_support.cpp
index 52414e54d..673d353c1 100644
--- a/op2/src/cuda/op_cuda_rt_support.cpp
+++ b/op2/src/cuda/op_cuda_rt_support.cpp
@@ -94,7 +94,7 @@ void __cutilCheckMsg(const char *errorMessage, const char *file,
 // routines to move arrays to/from GPU device
 //
 
-void op_mvHostToDevice(void **map, int size) {
+void op_mvHostToDevice(void **map, size_t size) {
   if (!OP_hybrid_gpu || size == 0)
     return;
   void *tmp;
@@ -105,7 +105,7 @@ void op_mvHostToDevice(void **map, int size) {
   *map = tmp;
 }
 
-void op_cpHostToDevice(void **data_d, void **data_h, int size) {
+void op_cpHostToDevice(void **data_d, void **data_h, size_t size) {
   if (!OP_hybrid_gpu)
     return;
   if (*data_d != NULL) cutilSafeCall(cudaFree(*data_d));
diff --git a/op2/src/openmp4/op_openmp4_decl.cpp b/op2/src/openmp4/op_openmp4_decl.cpp
index da8048c12..d8ef6980f 100644
--- a/op2/src/openmp4/op_openmp4_decl.cpp
+++ b/op2/src/openmp4/op_openmp4_decl.cpp
@@ -9,9 +9,9 @@
 
 void cutilDeviceInit(int argc, char **argv);
 
-void op_mvHostToDevice(void **map, int size);
+void op_mvHostToDevice(void **map, size_t size);
 
-void op_cpHostToDevice(void **data_d, void **data_h, int size);
+void op_cpHostToDevice(void **data_d, void **data_h, size_t size);
 
 void op_cuda_exit();
 
diff --git a/op2/src/openmp4/op_openmp4_rt_support.cpp b/op2/src/openmp4/op_openmp4_rt_support.cpp
index 078484915..32a59f7ad 100644
--- a/op2/src/openmp4/op_openmp4_rt_support.cpp
+++ b/op2/src/openmp4/op_openmp4_rt_support.cpp
@@ -19,7 +19,7 @@
 // routines to move arrays to/from GPU device
 //
 
-void op_mvHostToDevice(void **map, int size) {
+void op_mvHostToDevice(void **map, size_t size) {
   if (!OP_hybrid_gpu)
     return;
   char *temp = (char*)*map;
@@ -28,7 +28,7 @@ void op_mvHostToDevice(void **map, int size) {
   //TODO test
 }
 
-void op_cpHostToDevice(void **data_d, void **data_h, int size) {
+void op_cpHostToDevice(void **data_d, void **data_h, size_t size) {
   if (!OP_hybrid_gpu)
     return;
   *data_d = (char*)op_malloc(size);
@@ -121,8 +121,9 @@ void op_cuda_exit() {
     return;
   op_dat_entry *item;
   TAILQ_FOREACH(item, &OP_dat_list, entries) {
-    #pragma omp target exit data map(from: (item->dat)->data_d)
-    free((item->dat)->data_d);
+    char *data_d = (item->dat)->data_d;
+    #pragma omp target exit data map(from: data_d)
+    free(data_d);
  }
 /*
   for (int ip = 0; ip < OP_plan_index; ip++) {